X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/7eec73eb790f7c4982edfc28c17c011e8a072490..d555b9dd665a725cb821a516235ceaa30169c3e6:/regcomp.c diff --git a/regcomp.c b/regcomp.c index a62e0b7..2003d3d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -8324,33 +8324,47 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * as an SVt_INVLIST scalar. * * An inversion list for Unicode is an array of code points, sorted by ordinal - * number. The zeroth element is the first code point in the list. The 1th - * element is the first element beyond that not in the list. In other words, - * the first range is - * invlist[0]..(invlist[1]-1) - * The other ranges follow. Thus every element whose index is divisible by two - * marks the beginning of a range that is in the list, and every element not - * divisible by two marks the beginning of a range not in the list. A single - * element inversion list that contains the single code point N generally - * consists of two elements - * invlist[0] == N - * invlist[1] == N+1 - * (The exception is when N is the highest representable value on the - * machine, in which case the list containing just it would be a single - * element, itself. By extension, if the last range in the list extends to - * infinity, then the first element of that range will be in the inversion list - * at a position that is divisible by two, and is the final element in the - * list.) + * number. Each element gives the code point that begins a range that extends + * up-to but not including the code point given by the next element. The final + * element gives the first code point of a range that extends to the platform's + * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], + * ...) give ranges whose code points are all in the inversion list. We say + * that those ranges are in the set. The odd-numbered elements give ranges + * whose code points are not in the inversion list, and hence not in the set. + * Thus, element [0] is the first code point in the list. Element [1] + * is the first code point beyond that not in the list; and element [2] is the + * first code point beyond that that is in the list. In other words, the first + * range is invlist[0]..(invlist[1]-1), and all code points in that range are + * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and + * all code points in that range are not in the inversion list. The third + * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion + * list, and so forth. Thus every element whose index is divisible by two + * gives the beginning of a range that is in the list, and every element whose + * index is not divisible by two gives the beginning of a range not in the + * list. If the final element's index is divisible by two, the inversion list + * extends to the platform's infinity; otherwise the highest code point in the + * inversion list is the contents of that element minus 1. + * + * A range that contains just a single code point N will look like + * invlist[i] == N + * invlist[i+1] == N+1 + * + * If N is UV_MAX (the highest representable code point on the machine), N+1 is + * impossible to represent, so element [i+1] is omitted. The single element + * inversion list + * invlist[0] == UV_MAX + * contains just UV_MAX, but is interpreted as matching to infinity. + * * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. * This implementation reserves an element at the beginning of each inversion * list to always contain 0; there is an additional flag in the header which * indicates if the list begins at the 0, or is offset to begin at the next - * element. + * element. This means that the inversion list can be inverted without any + * copying; just flip the flag. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. - * More will be coming when functionality is added later. * * The inversion list data structure is currently implemented as an SV pointing * to an array of UVs that the SV thinks are bytes. This allows us to have an @@ -8671,7 +8685,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, UV final_element = len - 1; array = invlist_array(invlist); - if (array[final_element] > start + if ( array[final_element] > start || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", @@ -8679,10 +8693,10 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } - /* Here, it is a legal append. If the new range begins with the first - * value not in the set, it is extending the set, so the new first - * value not in the set is one greater than the newly extended range. - * */ + /* Here, it is a legal append. If the new range begins 1 above the end + * of the range below it, it is extending the range below it, so the + * new first value not in the set is one greater than the newly + * extended range. */ offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { @@ -8690,7 +8704,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, } else { /* But if the end is the maximum representable on the machine, - * just let the range that this would extend to have no end */ + * assume that infinity was actually what was meant. Just let + * the range that this would extend to have no end */ invlist_set_len(invlist, len - 1, offset); } return; @@ -8730,7 +8745,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, #ifndef PERL_IN_XSUB_RE -IV +SSize_t Perl__invlist_search(SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code @@ -8949,16 +8964,14 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, UV i_b = 0; UV i_u = 0; - bool has_something_from_a = FALSE; - bool has_something_from_b = FALSE; - - /* running count, as explained in the algorithm source book; items are * stopped accumulating and are output when the count changes to/from 0. - * The count is incremented when we start a range that's in the set, and - * decremented when we start a range that's not in the set. So its range - * is 0 to 2. Only when the count is zero is something not in the set. - */ + * The count is incremented when we start a range that's in an input's set, + * and decremented when we start a range that's not in a set. So this + * variable can be 0, 1, or 2. When it is 0 neither input is in their set, + * and hence nothing goes into the union; 1, just one of the inputs is in + * its set (and its current range gets added to the union); and 2 when both + * inputs are in their sets. */ UV count = 0; PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; @@ -9021,7 +9034,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, invlist_replace_list_destroys_src(*output, u); SvREFCNT_dec_NN(u); - return; + return; } if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { @@ -9092,10 +9105,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, u = _new_invlist(len_a + len_b); /* Will contain U+0000 if either component does */ - array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) - || (len_b > 0 && array_b[0] == 0)); + array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); - /* Go through each list item by item, stopping when exhausted one of + /* Go through each input list item by item, stopping when exhausted one of * them */ while (i_a < len_a && i_b < len_b) { UV cp; /* The element to potentially add to the union's array */ @@ -9103,27 +9116,25 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* We need to take one or the other of the two inputs for the union. * Since we are merging two sorted lists, we take the smaller of the - * next items. In case of a tie, we take the one that is in its set - * first. If we took one not in the set first, it would decrement the - * count, possibly to 0 which would cause it to be output as ending the - * range, and the next time through we would take the same number, and - * output it again as beginning the next range. By doing it the - * opposite way, there is no possibility that the count will be - * momentarily decremented to 0, and thus the two adjoining ranges will - * be seamlessly merged. (In a tie and both are in the set or both not - * in the set, it doesn't matter which we take first.) */ - if (array_a[i_a] < array_b[i_b] - || (array_a[i_a] == array_b[i_b] + * next items. In case of a tie, we take first the one that is in its + * set. If we first took the one not in its set, it would decrement + * the count, possibly to 0 which would cause it to be output as ending + * the range, and the next time through we would take the same number, + * and output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if ( array_a[i_a] < array_b[i_b] + || ( array_a[i_a] == array_b[i_b] && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp= array_a[i_a++]; - has_something_from_a = TRUE; + cp = array_a[i_a++]; } else { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); cp = array_b[i_b++]; - has_something_from_b = TRUE; } /* Here, have chosen which of the two inputs to look at. Only output @@ -9143,83 +9154,53 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* Here, we are finished going through at least one of the lists, which - * means there is something remaining in at most one. We check if the list - * that hasn't been exhausted is positioned such that we are in the middle - * of a range in its set or not. (i_a and i_b point to the element beyond - * the one we care about.) If in the set, we decrement 'count'; if 0, there - * is potentially more to output. - * There are four cases: - * 1) Both weren't in their sets, count is 0, and remains 0. What's left - * in the union is entirely from the non-exhausted set. - * 2) Both were in their sets, count is 2. Nothing further should - * be output, as everything that remains will be in the exhausted - * list's set, hence in the union; decrementing to 1 but not 0 insures - * that - * 3) the exhausted was in its set, non-exhausted isn't, count is 1. - * Nothing further should be output because the union includes - * everything from the exhausted set. Not decrementing ensures that. - * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; - * decrementing to 0 insures that we look at the remainder of the - * non-exhausted set */ + + /* The loop above increments the index into exactly one of the input lists + * each iteration, and ends when either index gets to its list end. That + * means the other index is lower than its end, and so something is + * remaining in that one. We decrement 'count', as explained below, if + * that list is in its set. (i_a and i_b each currently index the element + * beyond the one we care about.) */ if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count--; } - /* The final length is what we've output so far, plus what else is about to - * be output. (If 'count' is non-zero, then the input list we exhausted - * has everything remaining up to the machine's limit in its set, and hence - * in the union, so there will be no further output. */ + /* Above we decremented 'count' if the list that had unexamined elements in + * it was in its set. This has made it so that 'count' being non-zero + * means there isn't anything left to output; and 'count' equal to 0 means + * that what is left to output is precisely that which is left in the + * non-exhausted input list. + * + * To see why, note first that the exhausted input obviously has nothing + * left to add to the union. If it was in its set at its end, that means + * the set extends from here to the platform's infinity, and hence so does + * the union and the non-exhausted set is irrelevant. The exhausted set + * also contributed 1 to 'count'. If 'count' was 2, it got decremented to + * 1, but if it was 1, the non-exhausted set wasn't in its set, and so + * 'count' remains at 1. This is consistent with the decremented 'count' + * != 0 meaning there's nothing left to add to the union. + * + * But if the exhausted input wasn't in its set, it contributed 0 to + * 'count', and the rest of the union will be whatever the other input is. + * If 'count' was 0, neither list was in its set, and 'count' remains 0; + * otherwise it gets decremented to 0. This is consistent with 'count' + * == 0 meaning the remainder of the union is whatever is left in the + * non-exhausted list. */ if (count != 0) { - - /* Here, there is nothing left to put in the union. If the union came - * only from the input that it is to overwrite, this whole operation is - * a no-op */ - if ( UNLIKELY(! has_something_from_b && *output == a) - || UNLIKELY(! has_something_from_a && *output == b)) - { - SvREFCNT_dec_NN(u); - return; - } - len_u = i_u; } else { - /* When 'count' is 0, the list that was exhausted (if one was shorter - * than the other) ended with everything above it not in its set. That - * means that the remaining part of the union is precisely the same as - * the non-exhausted list, so can just copy it unchanged. If only one - * of the inputs contributes to the union, and the output is to - * overwite that particular input, then this whole operation was a - * no-op. */ - IV copy_count = len_a - i_a; - if (copy_count > 0) { - if (UNLIKELY(! has_something_from_b && *output == a)) { - SvREFCNT_dec_NN(u); - return; - } + if (copy_count > 0) { /* The non-exhausted input is 'a' */ Copy(array_a + i_a, array_u + i_u, copy_count, UV); - len_u = i_u + copy_count; } - else if ((copy_count = len_b - i_b) > 0) { - if (UNLIKELY(! has_something_from_a && *output == b)) { - SvREFCNT_dec_NN(u); - return; - } + else { /* The non-exhausted input is b */ + copy_count = len_b - i_b; Copy(array_b + i_b, array_u + i_u, copy_count, UV); - len_u = i_u + copy_count; - } else if ( UNLIKELY(! has_something_from_b && *output == a) - || UNLIKELY(! has_something_from_a && *output == b)) - { - /* Here, both arrays are exhausted, so no need to do any additional - * copying. Also here, the union came only from the input that it is - * to overwrite, so this whole operation is a no-op */ - SvREFCNT_dec_NN(u); - return; } + len_u = i_u + copy_count; } /* Set the result to the final length, which can change the pointer to @@ -9243,7 +9224,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * shown [perl #127392] that if the input is a mortal, we can get a * huge build-up of these during regex compilation before they get * freed. So for that case, replace just the input's interior with - * the output's, and then free the output */ + * the union's, and then free the union */ assert(! invlist_is_iterating(*output)); @@ -9295,17 +9276,14 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, UV i_b = 0; UV i_r = 0; - /* running count, as explained in the algorithm source book; items are - * stopped accumulating and are output when the count changes to/from 2. - * The count is incremented when we start a range that's in the set, and - * decremented when we start a range that's not in the set. So its range - * is 0 to 2. Only when the count is 2 is something in the intersection. - */ + /* running count of how many of the two inputs are postitioned at ranges + * that are in their sets. As explained in the algorithm source book, + * items are stopped accumulating and are output when the count changes + * to/from 2. The count is incremented when we start a range that's in an + * input's set, and decremented when we start a range that's not in a set. + * Only when it is 2 are we in the intersection. */ UV count = 0; - bool has_something_from_a = FALSE; - bool has_something_from_b = FALSE; - PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; assert(a != b); @@ -9375,8 +9353,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, r= _new_invlist(len_a + len_b); /* Will contain U+0000 iff both components do */ - array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 - && len_b > 0 && array_b[0] == 0); + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); /* Go through each list item by item, stopping when exhausted one of * them */ @@ -9387,27 +9365,25 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* We need to take one or the other of the two inputs for the * intersection. Since we are merging two sorted lists, we take the - * smaller of the next items. In case of a tie, we take the one that - * is not in its set first (a difference from the union algorithm). If - * we took one in the set first, it would increment the count, possibly - * to 2 which would cause it to be output as starting a range in the - * intersection, and the next time through we would take that same - * number, and output it again as ending the set. By doing it the - * opposite of this, there is no possibility that the count will be - * momentarily incremented to 2. (In a tie and both are in the set or - * both not in the set, it doesn't matter which we take first.) */ - if (array_a[i_a] < array_b[i_b] - || (array_a[i_a] == array_b[i_b] + * smaller of the next items. In case of a tie, we take first the one + * that is not in its set (a difference from the union algorithm). If + * we first took the one in its set, it would increment the count, + * possibly to 2 which would cause it to be output as starting a range + * in the intersection, and the next time through we would take that + * same number, and output it again as ending the set. By doing the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if ( array_a[i_a] < array_b[i_b] + || ( array_a[i_a] == array_b[i_b] && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp= array_a[i_a++]; - has_something_from_a = TRUE; + cp = array_a[i_a++]; } else { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); cp= array_b[i_b++]; - has_something_from_b = TRUE; } /* Here, have chosen which of the two inputs to look at. Only output @@ -9425,76 +9401,55 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } count--; } + } - /* Here, we are finished going through at least one of the lists, which - * means there is something remaining in at most one. We check if the list - * that has been exhausted is positioned such that we are in the middle - * of a range in its set or not. (i_a and i_b point to elements 1 beyond - * the ones we care about.) There are four cases: - * 1) Both weren't in their sets, count is 0, and remains 0. There's - * nothing left in the intersection. - * 2) Both were in their sets, count is 2 and perhaps is incremented to - * above 2. What should be output is exactly that which is in the - * non-exhausted set, as everything it has is also in the intersection - * set, and everything it doesn't have can't be in the intersection - * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and - * gets incremented to 2. Like the previous case, the intersection is - * everything that remains in the non-exhausted set. - * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and - * remains 1. And the intersection has nothing more. */ + /* The loop above increments the index into exactly one of the input lists + * each iteration, and ends when either index gets to its list end. That + * means the other index is lower than its end, and so something is + * remaining in that one. We increment 'count', as explained below, if the + * exhausted list was in its set. (i_a and i_b each currently index the + * element beyond the one we care about.) */ if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) - || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count++; } - if (count < 2) { - - /* Here, there is nothing left to put in the intersection. If the - * intersection came only from the input that it is to overwrite, this - * whole operation is a no-op */ - if ( UNLIKELY(! has_something_from_b && *i == a) - || UNLIKELY(! has_something_from_a && *i == b)) - { - SvREFCNT_dec_NN(r); - return; - } - + /* Above we incremented 'count' if the exhausted list was in its set. This + * has made it so that 'count' being below 2 means there is nothing left to + * output; otheriwse what's left to add to the intersection is precisely + * that which is left in the non-exhausted input list. + * + * To see why, note first that the exhausted input obviously has nothing + * left to affect the intersection. If it was in its set at its end, that + * means the set extends from here to the platform's infinity, and hence + * anything in the non-exhausted's list will be in the intersection, and + * anything not in it won't be. Hence, the rest of the intersection is + * precisely what's in the non-exhausted list The exhausted set also + * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing + * it means 'count' is now at least 2. This is consistent with the + * incremented 'count' being >= 2 means to add the non-exhausted list to + * the intersection. + * + * But if the exhausted input wasn't in its set, it contributed 0 to + * 'count', and the intersection can't include anything further; the + * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get + * incremented. This is consistent with 'count' being < 2 meaning nothing + * further to add to the intersection. */ + if (count < 2) { /* Nothing left to put in the intersection. */ len_r = i_r; } - else { - /* When 'count' is 2 or more, the list that was exhausted, what remains - * in the intersection is precisely the same as the non-exhausted list, - * so can just copy it unchanged. If only one of the inputs - * contributes to the intersection, and the output is to overwite that - * particular input, then this whole operation was a no-op. */ - + else { /* copy the non-exhausted list, unchanged. */ IV copy_count = len_a - i_a; - if (copy_count > 0) { - if (UNLIKELY(! has_something_from_b && *i == a)) { - SvREFCNT_dec_NN(r); - return; - } + if (copy_count > 0) { /* a is the one with stuff left */ Copy(array_a + i_a, array_r + i_r, copy_count, UV); - len_r = i_r + copy_count; } - else if ((copy_count = len_b - i_b) > 0) { - if (UNLIKELY(! has_something_from_a && *i == b)) { - SvREFCNT_dec_NN(r); - return; - } + else { /* b is the one with stuff left */ + copy_count = len_b - i_b; Copy(array_b + i_b, array_r + i_r, copy_count, UV); - len_r = i_r + copy_count; - } else if ( UNLIKELY(! has_something_from_b && *i == a) - || UNLIKELY(! has_something_from_a && *i == b)) - { - /* Here, both arrays are exhausted, so no need to do any additional - * copying. Also here, the intersection came only from the input - * that it is to overwrite, so this whole operation is a no-op */ - SvREFCNT_dec_NN(r); - return; } + len_r = i_r + copy_count; } /* Set the result to the final length, which can change the pointer to @@ -9506,6 +9461,17 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, array_r = invlist_array(r); } + /* Finish outputting any remaining */ + if (count >= 2) { /* At most one will have a non-zero copy count */ + IV copy_count; + if ((copy_count = len_a - i_a) > 0) { + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else if ((copy_count = len_b - i_b) > 0) { + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + } + /* If the output is not to overwrite either of the inputs, just return the * calculated intersection */ if (a != *i && b != *i) { @@ -11419,7 +11385,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); } else if (ret == NULL) - ret = latest; + ret = latest; *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; @@ -12029,39 +11995,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, } -/* - * reg_recode - * - * It returns the code point in utf8 for the value in *encp. - * value: a code value in the source encoding - * encp: a pointer to an Encode object - * - * If the result from Encode is not a single character, - * it returns U+FFFD (Replacement character) and sets *encp to NULL. - */ -STATIC UV -S_reg_recode(pTHX_ const U8 value, SV **encp) -{ - STRLEN numlen = 1; - SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP); - const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); - const STRLEN newlen = SvCUR(sv); - UV uv = UNICODE_REPLACEMENT; - - PERL_ARGS_ASSERT_REG_RECODE; - - if (newlen) - uv = SvUTF8(sv) - ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) - : *(U8*)s; - - if (!newlen || numlen != newlen) { - uv = UNICODE_REPLACEMENT; - *encp = NULL; - } - return uv; -} - PERL_STATIC_INLINE U8 S_compute_EXACTish(RExC_state_t *pRExC_state) { @@ -12275,13 +12208,15 @@ S_backref_value(char *p) /* - regatom - the lowest level - Try to identify anything special at the start of the pattern. If there - is, then handle it as required. This may involve generating a single regop, - such as for an assertion; or it may involve recursing, such as to - handle a () structure. + Try to identify anything special at the start of the current parse position. + If there is, then handle it as required. This may involve generating a + single regop, such as for an assertion; or it may involve recursing, such as + to handle a () structure. If the string doesn't start with something special then we gobble up - as much literal text as we can. + as much literal text as we can. If we encounter a quantifier, we have to + back off the final literal character, as that quantifier applies to just it + and not to the whole string of literals. Once we have been able to handle whatever type of thing started the sequence, we return. @@ -12996,6 +12931,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) || UTF8_IS_START(UCHARAT(RExC_parse))); + /* Here, we have a literal character. Find the maximal string of + * them in the input that we can fit into a single EXACTish node. + * We quit at the first non-literal or when the node gets full */ for (p = RExC_parse; len < upper_parse && p < RExC_end; len++) @@ -13126,9 +13064,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL(error_msg); } ender = result; - if (IN_ENCODING && ender < 0x100) { - goto recode_encoding; - } if (ender > 0xff) { REQUIRE_UTF8(flagp); } @@ -13161,11 +13096,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (RExC_recode_x_to_native) { ender = LATIN1_TO_NATIVE(ender); } - else #endif - if (IN_ENCODING) { - goto recode_encoding; - } } else { REQUIRE_UTF8(flagp); @@ -13225,17 +13156,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) form_short_octal_warning(p, numlen)); } } - if (IN_ENCODING && ender < 0x100) - goto recode_encoding; - break; - recode_encoding: - if (! RExC_override_recoding) { - SV* enc = _get_encoding(); - ender = reg_recode((U8)ender, &enc); - if (!enc && PASS2) - ckWARNreg(p, "Invalid escape in the specified encoding"); - REQUIRE_UTF8(flagp); - } break; case '\0': if (p >= RExC_end) @@ -13259,7 +13179,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * something like "\b" */ if (len || (p > RExC_start && isALPHA_A(*(p -1)))) { RExC_parse = p + 1; - vFAIL("Unescaped left brace in regex is illegal"); + vFAIL("Unescaped left brace in regex is illegal here"); } /*FALLTHROUGH*/ default: /* A literal character */ @@ -13291,6 +13211,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * the node, close the node with just them, and set up to do * this character again next time through, when it will be the * only thing in its new node */ + if ((next_is_quantifier = ( LIKELY(p < RExC_end) && UNLIKELY(ISMULT2(p)))) && LIKELY(len)) @@ -13664,8 +13585,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = p - 1; Set_Node_Cur_Length(ret, parse_start); RExC_parse = p; - skip_to_be_ignored_text(pRExC_state, &RExC_parse, - FALSE /* Don't force to /x */ ); { /* len is STRLEN which is unsigned, need to copy to signed */ IV iv = len; @@ -13677,6 +13596,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; } /* End of giant switch on input character */ + /* Position parse to next real character */ + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); + if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) { + ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through"); + } + return(ret); } @@ -14534,8 +14460,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, ADD_POSIX_WARNING(p, "there is no terminating ']'"); } - if (posix_warnings && av_top_index(RExC_warn_text) > -1) { - /* warn_text should only be true if posix_warnings is true */ + if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) { *posix_warnings = RExC_warn_text; } } @@ -16205,9 +16130,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } non_portable_endpoint++; - if (IN_ENCODING && value < 0x100) { - goto recode_encoding; - } break; case 'x': RExC_parse--; /* function expects to be pointed at the 'x' */ @@ -16225,8 +16147,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } non_portable_endpoint++; - if (IN_ENCODING && value < 0x100) - goto recode_encoding; break; case 'c': value = grok_bslash_c(*RExC_parse++, PASS2); @@ -16259,23 +16179,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } non_portable_endpoint++; - if (IN_ENCODING && value < 0x100) - goto recode_encoding; - break; - } - recode_encoding: - if (! RExC_override_recoding) { - SV* enc = _get_encoding(); - value = reg_recode((U8)value, &enc); - if (!enc) { - if (strict) { - vFAIL("Invalid escape in the specified encoding"); - } - else if (PASS2) { - ckWARNreg(RExC_parse, - "Invalid escape in the specified encoding"); - } - } break; } default: @@ -16416,9 +16319,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else if (! SIZE_ONLY) { /* Here, not in pass1 (in that pass we skip calculating the - * contents of this class), and is /l, or is a POSIX class for - * which /l doesn't matter (or is a Unicode property, which is - * skipped here). */ + * contents of this class), and is not /l, or is a POSIX class + * for which /l doesn't matter (or is a Unicode property, which + * is skipped here). */ if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ @@ -16443,9 +16346,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &cp_list); } } - else if (UNI_SEMANTICS + else if ( UNI_SEMANTICS || classnum == _CC_ASCII - || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT + || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT || classnum == _CC_XDIGIT))) { /* We usually have to worry about /d and /a affecting what @@ -17242,11 +17145,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvREFCNT_dec_NN(cp_foldable_list); } - /* And combine the result (if any) with any inversion list from posix + /* And combine the result (if any) with any inversion lists from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (simple_posixes) { + if (simple_posixes) { /* These are the classes known to be unaffected by + /a, /aa, and /d */ _invlist_union(cp_list, simple_posixes, &cp_list); SvREFCNT_dec_NN(simple_posixes); } @@ -18171,7 +18075,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; skip_to_be_ignored_text(pRExC_state, &RExC_parse, - FALSE /* Don't assume /x */ ); + FALSE /* Don't force /x */ ); } } @@ -18812,7 +18716,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ : TRIE_BITMAP(trie)), NULL, NULL, - NULL + NULL, + FALSE ); sv_catpvs(sv, "]"); } @@ -18911,6 +18816,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* And things that aren't in the bitmap, but are small enough to be */ SV* bitmap_range_not_in_bitmap = NULL; + const bool inverted = flags & ANYOF_INVERT; + if (OP(o) == ANYOFL) { if (ANYOFL_UTF8_LOCALE_REQD(flags)) { sv_catpvs(sv, "{utf8-locale-reqd}"); @@ -18955,21 +18862,37 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ ANYOF_BITMAP(o), bitmap_range_not_in_bitmap, only_utf8_locale_invlist, - o); + o, + + /* Can't try inverting for a + * better display if there are + * things that haven't been + * resolved */ + unresolved != NULL); SvREFCNT_dec(bitmap_range_not_in_bitmap); /* If there are user-defined properties which haven't been defined yet, - * output them, in a separate [] from the bitmap range stuff */ + * output them. If the result is not to be inverted, it is clearest to + * output them in a separate [] from the bitmap range stuff. If the + * result is to be complemented, we have to show everything in one [], + * as the inversion applies to the whole thing. Use {braces} to + * separate them from anything in the bitmap and anything above the + * bitmap. */ if (unresolved) { - if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + if (inverted) { + if (! do_sep) { /* If didn't output anything in the bitmap */ + sv_catpvs(sv, "^"); + } + sv_catpvs(sv, "{"); } - if (flags & ANYOF_INVERT) { - sv_catpvs(sv, "^"); + else if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); } sv_catsv(sv, unresolved); - do_sep = TRUE; - SvREFCNT_dec_NN(unresolved); + if (inverted) { + sv_catpvs(sv, "}"); + } + do_sep = ! inverted; } /* And, finally, add the above-the-bitmap stuff */ @@ -18986,9 +18909,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); } - /* And, for easy of understanding, it is always output not-shown as - * complemented */ - if (flags & ANYOF_INVERT) { + /* And, for easy of understanding, it is shown in the + * uncomplemented form if possible. The one exception being if + * there are unresolved items, where the inversion has to be + * delayed until runtime */ + if (inverted && ! unresolved) { _invlist_invert(nonbitmap_invlist); _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); } @@ -19025,6 +18950,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + SvREFCNT_dec(unresolved); } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; @@ -19838,30 +19765,36 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) * mnemonic names. Split off any of those at the beginning and end of * the range to print mnemonically. It isn't possible for many of * these to be in a row, so this won't overwhelm with output */ - while (isMNEMONIC_CNTRL(start) && start <= end) { - put_code_point(sv, start); - start++; - } - if (start < end && isMNEMONIC_CNTRL(end)) { - - /* Here, the final character in the range has a mnemonic name. - * Work backwards from the end to find the final non-mnemonic */ - UV temp_end = end - 1; - while (isMNEMONIC_CNTRL(temp_end)) { - temp_end--; + if ( start <= end + && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) + { + while (isMNEMONIC_CNTRL(start) && start <= end) { + put_code_point(sv, start); + start++; } - /* And separately output the interior range that doesn't start or - * end with mnemonics */ - put_range(sv, start, temp_end, FALSE); + /* If this didn't take care of the whole range ... */ + if (start <= end) { - /* Then output the mnemonic trailing controls */ - start = temp_end + 1; - while (start <= end) { - put_code_point(sv, start); - start++; + /* Look backwards from the end to find the final non-mnemonic + * */ + UV temp_end = end; + while (isMNEMONIC_CNTRL(temp_end)) { + temp_end--; + } + + /* And separately output the interior range that doesn't start + * or end with mnemonics */ + put_range(sv, start, temp_end, FALSE); + + /* Then output the mnemonic trailing controls */ + start = temp_end + 1; + while (start <= end) { + put_code_point(sv, start); + start++; + } + break; } - break; } /* As a final resort, output the range or subrange as hex. */ @@ -19949,7 +19882,9 @@ S_put_charclass_bitmap_innards_common(pTHX_ ) { /* Create and return an SV containing a displayable version of the bitmap - * and associated information determined by the input parameters. */ + * and associated information determined by the input parameters. If the + * output would have been only the inversion indicator '^', NULL is instead + * returned. */ SV * output; @@ -20008,9 +19943,8 @@ S_put_charclass_bitmap_innards_common(pTHX_ } } - /* If the only thing we output is the '^', clear it */ if (invert && SvCUR(output) == 1) { - SvCUR_set(output, 0); + return NULL; } return output; @@ -20021,7 +19955,8 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV *nonbitmap_invlist, SV *only_utf8_locale_invlist, - const regnode * const node) + const regnode * const node, + const bool force_as_is_display) { /* Appends to 'sv' a displayable version of the innards of the bracketed * character class defined by the other arguments: @@ -20037,13 +19972,16 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, * 'node' is the regex pattern node. It is needed only when the above two * parameters are not null, and is passed so that this routine can * tease apart the various reasons for them. + * 'force_as_is_display' is TRUE if this routine should definitely NOT try + * to invert things to see if that leads to a cleaner display. If + * FALSE, this routine is free to use its judgment about doing this. * * It returns TRUE if there was actually something output. (It may be that * the bitmap, etc is empty.) * * When called for outputting the bitmap of a non-ANYOF node, just pass the - * bitmap, with the succeeding parameters set to NULL. - * + * bitmap, with the succeeding parameters set to NULL, and the final one to + * FALSE. */ /* In general, it tries to display the 'cleanest' representation of the @@ -20051,7 +19989,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, * whether the class itself is to be inverted. However, there are some * cases where it can't try inverting, as what actually matches isn't known * until runtime, and hence the inversion isn't either. */ - bool inverting_allowed = TRUE; + bool inverting_allowed = ! force_as_is_display; int i; STRLEN orig_sv_cur = SvCUR(sv); @@ -20067,7 +20005,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, is UTF-8 */ SV* as_is_display; /* The output string when we take the inputs - literally */ + literally */ SV* inverted_display; /* The output string when we invert the inputs */ U8 flags = (node) ? ANYOF_FLAGS(node) : 0; @@ -20180,7 +20118,10 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, /* If have to take the output as-is, just do that */ if (! inverting_allowed) { - sv_catsv(sv, as_is_display); + if (as_is_display) { + sv_catsv(sv, as_is_display); + SvREFCNT_dec_NN(as_is_display); + } } else { /* But otherwise, create the output again on the inverted input, and use whichever version is shorter */ @@ -20216,10 +20157,13 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, _invlist_invert(only_utf8); _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); } + else if (not_utf8) { - if (not_utf8) { - _invlist_invert(not_utf8); - _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8); + /* If a code point matches iff the target string is not in UTF-8, + * then complementing the result has it not match iff not in UTF-8, + * which is the same thing as matching iff it is UTF-8. */ + only_utf8 = not_utf8; + not_utf8 = NULL; } if (only_utf8_locale) { @@ -20238,17 +20182,19 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, /* Use the shortest representation, taking into account our bias * against showing it inverted */ - if (SvCUR(inverted_display) + inverted_bias - < SvCUR(as_is_display) + as_is_bias) + if ( inverted_display + && ( ! as_is_display + || ( SvCUR(inverted_display) + inverted_bias + < SvCUR(as_is_display) + as_is_bias))) { sv_catsv(sv, inverted_display); } - else { + else if (as_is_display) { sv_catsv(sv, as_is_display); } - SvREFCNT_dec_NN(as_is_display); - SvREFCNT_dec_NN(inverted_display); + SvREFCNT_dec(as_is_display); + SvREFCNT_dec(inverted_display); } SvREFCNT_dec_NN(invlist);