X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/ac455f4ca4256dea64dffc4369293851b66f2601..6cf2ebbb422b9b31788168945caf59bce99e5764:/regcomp.c

diff --git a/regcomp.c b/regcomp.c
index dcd9856..a067af6 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3048,20 +3048,17 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 }
                             } else {
 /* 
-    Currently we do not believe that the trie logic can
-    handle case insensitive matching properly when the
-    pattern is not unicode (thus forcing unicode semantics).
+    Currently the trie logic handles case insensitive matching properly only
+    when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
+    semantics).
 
     If/when this is fixed the following define can be swapped
     in below to fully enable trie logic.
 
-    XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps
-    not /aa
-
 #define TRIE_TYPE_IS_SAFE 1
 
 */
-#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT)
+#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
 
                                 if ( last && TRIE_TYPE_IS_SAFE ) {
                                     make_trie( pRExC_state, 
@@ -5830,13 +5827,15 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  * this file.  An inversion list is here implemented as a malloc'd C array with
  * some added info.  More will be coming when functionality is added later.
  *
+ * It is currently implemented as an HV to the outside world, but is actually
+ * an SV pointing to an array of UVs that the SV thinks are bytes.  This allows
+ * us to have an array of UV whose memory management is automatically handled
+ * by the existing facilities for SV's.
+ *
  * Some of the methods should always be private to the implementation, and some
  * should eventually be made public */
 
 #define INVLIST_INITIAL_LEN 10
-#define INVLIST_ARRAY_KEY "array"
-#define INVLIST_MAX_KEY "max"
-#define INVLIST_LEN_KEY "len"
 
 PERL_STATIC_INLINE UV*
 S_invlist_array(pTHX_ HV* const invlist)
@@ -5845,30 +5844,9 @@ S_invlist_array(pTHX_ HV* const invlist)
      * length changes, this needs to be called in case malloc or realloc moved
      * it */
 
-    SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
-
     PERL_ARGS_ASSERT_INVLIST_ARRAY;
 
-    if (list_ptr == NULL) {
-	Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
-							    INVLIST_ARRAY_KEY);
-    }
-
-    return INT2PTR(UV *, SvUV(*list_ptr));
-}
-
-PERL_STATIC_INLINE void
-S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
-{
-    PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
-
-    /* Sets the array stored in the inversion list to the memory beginning with
-     * the parameter */
-
-    if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
-	Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
-							    INVLIST_ARRAY_KEY);
-    }
+    return (UV *) SvPVX(invlist);
 }
 
 PERL_STATIC_INLINE UV
@@ -5876,16 +5854,9 @@ S_invlist_len(pTHX_ HV* const invlist)
 {
     /* Returns the current number of elements in the inversion list's array */
 
-    SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
-
     PERL_ARGS_ASSERT_INVLIST_LEN;
 
-    if (len_ptr == NULL) {
-	Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
-							    INVLIST_LEN_KEY);
-    }
-
-    return SvUV(*len_ptr);
+    return SvCUR(invlist) / sizeof(UV);
 }
 
 PERL_STATIC_INLINE UV
@@ -5894,16 +5865,9 @@ S_invlist_max(pTHX_ HV* const invlist)
     /* Returns the maximum number of elements storable in the inversion list's
      * array, without having to realloc() */
 
-    SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
-
     PERL_ARGS_ASSERT_INVLIST_MAX;
 
-    if (max_ptr == NULL) {
-	Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
-							    INVLIST_MAX_KEY);
-    }
-
-    return SvUV(*max_ptr);
+    return SvLEN(invlist) / sizeof(UV);
 }
 
 PERL_STATIC_INLINE void
@@ -5913,14 +5877,7 @@ S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
 
     PERL_ARGS_ASSERT_INVLIST_SET_LEN;
 
-    if (len != 0 && len > invlist_max(invlist)) {
-	Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
-    }
-
-    if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
-	Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
-							    INVLIST_LEN_KEY);
-    }
+    SvCUR_set(invlist, len * sizeof(UV));
 }
 
 PERL_STATIC_INLINE void
@@ -5933,13 +5890,10 @@ S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
     PERL_ARGS_ASSERT_INVLIST_SET_MAX;
 
     if (max < invlist_len(invlist)) {
-	Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
+	Perl_croak(aTHX_ "panic: Can't make max size '%"UVuf"' less than current length %"UVuf" in inversion list", invlist_max(invlist), invlist_len(invlist));
     }
 
-    if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
-	Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
-							    INVLIST_LEN_KEY);
-    }
+    SvLEN_set(invlist, max * sizeof(UV));
 }
 
 #ifndef PERL_IN_XSUB_RE
@@ -5951,22 +5905,12 @@ Perl__new_invlist(pTHX_ IV initial_size)
      * space to store 'initial_size' elements.  If that number is negative, a
      * system default is used instead */
 
-    HV* invlist = newHV();
-    UV* list;
-
     if (initial_size < 0) {
 	initial_size = INVLIST_INITIAL_LEN;
     }
 
     /* Allocate the initial space */
-    Newx(list, initial_size, UV);
-    invlist_set_array(invlist, list);
-
-    /* set_len has to come before set_max, as the latter inspects the len */
-    invlist_set_len(invlist, 0);
-    invlist_set_max(invlist, initial_size);
-
-    return invlist;
+    return (HV *) newSV(initial_size * sizeof(UV));
 }
 #endif
 
@@ -5975,41 +5919,19 @@ S_invlist_destroy(pTHX_ HV* const invlist)
 {
    /* Inversion list destructor */
 
-    SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
-
     PERL_ARGS_ASSERT_INVLIST_DESTROY;
 
-    if (list_ptr != NULL) {
-	UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
-	Safefree(list);
-    }
+    SvREFCNT_dec(invlist);
 }
 
 STATIC void
 S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
 {
-    /* Change the maximum size of an inversion list (up or down) */
-
-    UV* orig_array;
-    UV* array;
-    const UV old_max = invlist_max(invlist);
+    /* Grow the maximum size of an inversion list */
 
     PERL_ARGS_ASSERT_INVLIST_EXTEND;
 
-    if (old_max == new_max) {	/* If a no-op */
-	return;
-    }
-
-    array = orig_array = invlist_array(invlist);
-    Renew(array, new_max, UV);
-
-    /* If the size change moved the list in memory, set the new one */
-    if (array != orig_array) {
-	invlist_set_array(invlist, array);
-    }
-
-    invlist_set_max(invlist, new_max);
-
+    SvGROW((SV *)invlist, new_max * sizeof(UV));
 }
 
 PERL_STATIC_INLINE void
@@ -6020,13 +5942,14 @@ S_invlist_trim(pTHX_ HV* const invlist)
     /* Change the length of the inversion list to how many entries it currently
      * has */
 
-    invlist_extend(invlist, invlist_len(invlist));
+    SvPV_shrink_to_cur((SV *) invlist);
 }
 
 /* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
  * etc */
 
 #define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
+#define PREV_ELEMENT_IN_INVLIST_SET(i) ! ELEMENT_IN_INVLIST_SET(i)
 
 #ifndef PERL_IN_XSUB_RE
 void
@@ -6193,9 +6116,9 @@ S_invlist_union(pTHX_ HV* const a, HV* const b)
     /* Here, we are finished going through at least one of the lists, which
      * means there is something remaining in at most one.  We check if the list
      * that hasn't been exhausted is positioned such that we are in the middle
-     * of a range in its set or not.  (We are in the set if the next item in
-     * the array marks the beginning of something not in the set)   If in the
-     * set, we decrement 'count'; if 0, there is potentially more to output.
+     * of a range in its set or not.  (i_a and i_b point to the element beyond
+     * the one we care about.) If in the set, we decrement 'count'; if 0, there
+     * is potentially more to output.
      * There are four cases:
      *	1) Both weren't in their sets, count is 0, and remains 0.  What's left
      *	   in the union is entirely from the non-exhausted set.
@@ -6205,12 +6128,12 @@ S_invlist_union(pTHX_ HV* const a, HV* const b)
      *	   that
      *	3) the exhausted was in its set, non-exhausted isn't, count is 1.
      *	   Nothing further should be output because the union includes
-     *	   everything from the exhausted set.  Not decrementing insures that.
+     *	   everything from the exhausted set.  Not decrementing ensures that.
      *	4) the exhausted wasn't in its set, non-exhausted is, count is 1;
      *	   decrementing to 0 insures that we look at the remainder of the
      *	   non-exhausted set */
-    if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
-	|| (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+    if ((i_a != len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+	|| (i_b != len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
     {
 	count--;
     }
@@ -6258,7 +6181,8 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
     /* Return the intersection of two inversion lists.  The basis for this
      * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
      * by Addison-Wesley, and explained at some length there.  The preface says
-     * to incorporate its examples into your code at your own risk.
+     * to incorporate its examples into your code at your own risk.  In fact,
+     * it had bugs
      *
      * The algorithm is like a merge sort, and is essentially the same as the
      * union above
@@ -6299,17 +6223,17 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
 		       array */
 	bool cp_in_set;	/* Is it in the input list's set or not */
 
-	/* We need to take one or the other of the two inputs for the union.
-	 * Since we are merging two sorted lists, we take the smaller of the
-	 * next items.  In case of a tie, we take the one that is not in its
-	 * set first (a difference from the union algorithm).  If we took one
-	 * in the set first, it would increment the count, possibly to 2 which
-	 * would cause it to be output as starting a range in the intersection,
-	 * and the next time through we would take that same number, and output
-	 * it again as ending the set.  By doing it the opposite of this, we
-	 * there is no possibility that the count will be momentarily
-	 * incremented to 2.  (In a tie and both are in the set or both not in
-	 * the set, it doesn't matter which we take first.) */
+	/* We need to take one or the other of the two inputs for the
+	 * intersection.  Since we are merging two sorted lists, we take the
+	 * smaller of the next items.  In case of a tie, we take the one that
+	 * is not in its set first (a difference from the union algorithm).  If
+	 * we took one in the set first, it would increment the count, possibly
+	 * to 2 which would cause it to be output as starting a range in the
+	 * intersection, and the next time through we would take that same
+	 * number, and output it again as ending the set.  By doing it the
+	 * opposite of this, there is no possibility that the count will be
+	 * momentarily incremented to 2.  (In a tie and both are in the set or
+	 * both not in the set, it doesn't matter which we take first.) */
 	if (array_a[i_a] < array_b[i_b]
 	    || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
 	{
@@ -6338,19 +6262,32 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
 	}
     }
 
-    /* Here, we are finished going through at least one of the sets, which
-     * means there is something remaining in at most one.  See the comments in
-     * the union code */
-    if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
-	|| (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+    /* Here, we are finished going through at least one of the lists, which
+     * means there is something remaining in at most one.  We check if the list
+     * that has been exhausted is positioned such that we are in the middle
+     * of a range in its set or not.  (i_a and i_b point to elements 1 beyond
+     * the ones we care about.)  There are four cases:
+     *	1) Both weren't in their sets, count is 0, and remains 0.  There's
+     *	   nothing left in the intersection.
+     *	2) Both were in their sets, count is 2 and perhaps is incremented to
+     *	   above 2.  What should be output is exactly that which is in the
+     *	   non-exhausted set, as everything it has is also in the intersection
+     *	   set, and everything it doesn't have can't be in the intersection
+     *	3) The exhausted was in its set, non-exhausted isn't, count is 1, and
+     *	   gets incremented to 2.  Like the previous case, the intersection is
+     *	   everything that remains in the non-exhausted set.
+     *	4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
+     *	   remains 1.  And the intersection has nothing more. */
+    if ((i_a == len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+	|| (i_b == len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
     {
-	count--;
+	count++;
     }
 
     /* The final length is what we've output so far plus what else is in the
-     * intersection.  Only one of the subexpressions below will be non-zero */
+     * intersection.  At most one of the subexpressions below will be non-zero */
     len_r = i_r;
-    if (count == 2) {
+    if (count >= 2) {
 	len_r += (len_a - i_a) + (len_b - i_b);
     }
 
@@ -6363,7 +6300,7 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b)
     }
 
     /* Finish outputting any remaining */
-    if (count == 2) { /* Only one of will have a non-zero copy count */
+    if (count >= 2) { /* At most one will have a non-zero copy count */
 	IV copy_count;
 	if ((copy_count = len_a - i_a) > 0) {
 	    Copy(array_a + i_a, array_r + i_r, copy_count, UV);
@@ -6716,6 +6653,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                             SvIV_set(sv_dat, 1);
                         }
 #ifdef DEBUGGING
+			/* Yes this does cause a memory leak in debugging Perls */
                         if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
                             SvREFCNT_dec(svname);
 #endif
@@ -7504,7 +7442,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
     const char * const origparse = RExC_parse;
     I32 min;
     I32 max = REG_INFTY;
+#ifdef RE_TRACK_PATTERN_OFFSETS
     char *parse_start;
+#endif
     const char *maxpos = NULL;
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -7523,7 +7463,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
     if (op == '{' && regcurly(RExC_parse)) {
 	maxpos = NULL;
+#ifdef RE_TRACK_PATTERN_OFFSETS
         parse_start = RExC_parse; /* MJD */
+#endif
 	next = RExC_parse + 1;
 	while (isDIGIT(*next) || *next == ',') {
 	    if (*next == ',') {
@@ -7619,7 +7561,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
       vFAIL("Regexp *+ operand could be empty");
 #endif
 
+#ifdef RE_TRACK_PATTERN_OFFSETS
     parse_start = RExC_parse;
+#endif
     nextchar(pRExC_state);
 
     *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
@@ -8761,6 +8705,7 @@ tryagain:
 			    break;
 			case 's':
 			case 'S':
+			case 0x17F: /* LATIN SMALL LETTER LONG S */
 			     if (AT_LEAST_UNI_SEMANTICS) {
 				if (latest_char_state == char_s) {  /* 'ss' */
 				    ender = LATIN_SMALL_LETTER_SHARP_S;
@@ -8818,6 +8763,9 @@ tryagain:
 			    latest_char_state = generic_char;
 			    break;
 			case 0x03C5:	/* First char in upsilon series */
+			case 0x03A5:	/* Also capital UPSILON, which folds to
+					   03C5, and hence exhibits the same
+					   problem */
 			    if (p < RExC_end - 4) { /* Need >= 4 bytes left */
 				latest_char_state = upsilon_1;
 				if (len != 0) {
@@ -8830,6 +8778,10 @@ tryagain:
 			    }
 			    break;
 			case 0x03B9:	/* First char in iota series */
+			case 0x0399:	/* Also capital IOTA */
+			case 0x1FBE:	/* GREEK PROSGEGRAMMENI folds to 3B9 */
+			case 0x0345:	/* COMBINING GREEK YPOGEGRAMMENI folds
+					   to 3B9 */
 			    if (p < RExC_end - 4) {
 				latest_char_state = iota_1;
 				if (len != 0) {
@@ -8865,13 +8817,14 @@ tryagain:
 			    break;
 
 			/* These are the tricky fold characters.  Flush any
-			 * buffer first. */
+			 * buffer first. (When adding to this list, also should
+			 * add them to fold_grind.t to make sure get tested) */
 			case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
 			case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
 			case LATIN_SMALL_LETTER_SHARP_S:
 			case LATIN_CAPITAL_LETTER_SHARP_S:
-			case 0x1FD3:
-			case 0x1FE3:
+			case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
+			case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
 			    if (len != 0) {
 				p = oldp;
 				goto loopdone;
@@ -10268,7 +10221,10 @@ parseit:
     /* Combine the two lists into one. */
     if (l1_fold_invlist) {
 	if (nonbitmap) {
-	    nonbitmap = invlist_union(nonbitmap, l1_fold_invlist);
+	    HV* temp = invlist_union(nonbitmap, l1_fold_invlist);
+	    invlist_destroy(nonbitmap);
+	    nonbitmap = temp;
+	    invlist_destroy(l1_fold_invlist);
 	}
 	else {
 	    nonbitmap = l1_fold_invlist;
@@ -11732,12 +11688,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
     dVAR;
     struct regexp *const r = (struct regexp *)SvANY(rx);
     regexp_internal *reti;
-    int len, npar;
+    int len;
     RXi_GET_DECL(r,ri);
 
     PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
     
-    npar = r->nparens+1;
     len = ProgLen(ri);
     
     Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);