#define STATIC static
#endif
-#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef MAX
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-#endif
-
/* this is a chain of data about sub patterns we are processing that
need to be handled separately/specially in study_chunk. Its so
we can simulate recursion without losing state. */
assert(is_ANYOF_SYNTHETIC(ssc));
- ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
- _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
+ /* mortalize so won't leak */
+ ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
}
: ( state==1 ? special : 0 ) \
)
+#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
+STMT_START { \
+ TRIE_BITMAP_SET(trie, uvc); \
+ /* store the folded codepoint */ \
+ if ( folder ) \
+ TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
+ \
+ if ( !UTF ) { \
+ /* store first byte of utf8 representation of */ \
+ /* variant codepoints */ \
+ if (! UVCHR_IS_INVARIANT(uvc)) { \
+ TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
+ } \
+ } \
+} STMT_END
#define MADE_TRIE 1
#define MADE_JUMP_TRIE 2
#define MADE_EXACT_TRIE 4
bitmap?*/
if (OP(noper) == NOTHING) {
+ /* skip past a NOTHING at the start of an alternation
+ * eg, /(?:)a|(?:b)/ should be the same as /a|b/
+ */
regnode *noper_next= regnext(noper);
if (noper_next < tail)
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail &&
+ (
+ OP(noper) == flags ||
+ (
+ flags == EXACTFU &&
+ OP(noper) == EXACTFU_SS
+ )
+ )
+ ) {
uc= (U8*)STRING(noper);
e= uc + STR_LEN(noper);
} else {
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
+
for ( ; uc < e ; uc += len ) { /* Look at each char in the current
branch */
TRIE_CHARCOUNT(trie)++;
if ( set_bit ) {
/* store the codepoint in the bitmap, and its folded
* equivalent. */
- TRIE_BITMAP_SET(trie, uvc);
-
- /* store the folded codepoint */
- if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
-
- if ( !UTF ) {
- /* store first byte of utf8 representation of
- variant codepoints */
- if (! UVCHR_IS_INVARIANT(uvc)) {
- TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
- }
- }
+ TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
set_bit = 0; /* We've done our bit :-) */
}
} else {
split out as an EXACT and put in front of the TRIE node. */
trie->startstate= 1;
if ( trie->bitmap && !widecharmap && !trie->jump ) {
+ /* we want to find the first state that has more than
+ * one transition, if that state is not the first state
+ * then we have a common prefix which we can remove.
+ */
U32 state;
for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
U32 ofs = 0;
- I32 idx = -1;
+ I32 first_ofs = -1; /* keeps track of the ofs of the first
+ transition, -1 means none */
U32 count = 0;
const U32 base = trie->states[ state ].trans.base;
+ /* does this state terminate an alternation? */
if ( trie->states[state].wordnum )
count = 1;
trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
{
if ( ++count > 1 ) {
- SV **tmp = av_fetch( revcharmap, ofs, 0);
- const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
+ /* we have more than one transition */
+ SV **tmp;
+ U8 *ch;
+ /* if this is the first state there is no common prefix
+ * to extract, so we can exit */
if ( state == 1 ) break;
+ tmp = av_fetch( revcharmap, ofs, 0);
+ ch = (U8*)SvPV_nolen_const( *tmp );
+
+ /* if we are on count 2 then we need to initialize the
+ * bitmap, and store the previous char if there was one
+ * in it*/
if ( count == 2 ) {
+ /* clear the bitmap */
Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
DEBUG_OPTIMISE_r(
Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [",
depth+1,
(UV)state));
- if (idx >= 0) {
- SV ** const tmp = av_fetch( revcharmap, idx, 0);
+ if (first_ofs >= 0) {
+ SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
- TRIE_BITMAP_SET(trie,*ch);
- if ( folder )
- TRIE_BITMAP_SET(trie, folder[ *ch ]);
+ TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "%s", (char*)ch)
);
}
}
- TRIE_BITMAP_SET(trie,*ch);
- if ( folder )
- TRIE_BITMAP_SET(trie,folder[ *ch ]);
+ /* store the current firstchar in the bitmap */
+ TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
}
- idx = ofs;
+ first_ofs = ofs;
}
}
if ( count == 1 ) {
- SV **tmp = av_fetch( revcharmap, idx, 0);
+ /* This state has only one transition, its transition is part
+ * of a common prefix - we need to concatenate the char it
+ * represents to what we have so far. */
+ SV **tmp = av_fetch( revcharmap, first_ofs, 0);
STRLEN len;
char *ch = SvPV( *tmp, len );
DEBUG_OPTIMISE_r({
SV *sv=sv_newmortal();
- Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
+ Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Ofs:%"UVuf" Char='%s'\n",
depth+1,
- (UV)state, (UV)idx,
+ (UV)state, (UV)first_ofs,
pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
PL_colors[0], PL_colors[1],
(SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
sv_catsv_nomg(pat, msv);
rx = msv;
}
- else
- pat = msv;
+ else {
+ /* We have only one SV to process, but we need to verify
+ * it is properly null terminated or we will fail asserts
+ * later. In theory we probably shouldn't get such SV's,
+ * but if we do we should handle it gracefully. */
+ if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) {
+ /* not a string, or a string with a trailing null */
+ pat = msv;
+ } else {
+ /* a string with no trailing null, we need to copy it
+ * so it we have a trailing null */
+ pat = newSVsv(msv);
+ }
+ }
if (code)
pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
* as an SVt_INVLIST scalar.
*
* An inversion list for Unicode is an array of code points, sorted by ordinal
- * number. The zeroth element is the first code point in the list. The 1th
- * element is the first element beyond that not in the list. In other words,
- * the first range is
- * invlist[0]..(invlist[1]-1)
- * The other ranges follow. Thus every element whose index is divisible by two
- * marks the beginning of a range that is in the list, and every element not
- * divisible by two marks the beginning of a range not in the list. A single
- * element inversion list that contains the single code point N generally
- * consists of two elements
- * invlist[0] == N
- * invlist[1] == N+1
- * (The exception is when N is the highest representable value on the
- * machine, in which case the list containing just it would be a single
- * element, itself. By extension, if the last range in the list extends to
- * infinity, then the first element of that range will be in the inversion list
- * at a position that is divisible by two, and is the final element in the
- * list.)
+ * number. Each element gives the code point that begins a range that extends
+ * up-to but not including the code point given by the next element. The final
+ * element gives the first code point of a range that extends to the platform's
+ * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
+ * ...) give ranges whose code points are all in the inversion list. We say
+ * that those ranges are in the set. The odd-numbered elements give ranges
+ * whose code points are not in the inversion list, and hence not in the set.
+ * Thus, element [0] is the first code point in the list. Element [1]
+ * is the first code point beyond that not in the list; and element [2] is the
+ * first code point beyond that that is in the list. In other words, the first
+ * range is invlist[0]..(invlist[1]-1), and all code points in that range are
+ * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
+ * all code points in that range are not in the inversion list. The third
+ * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
+ * list, and so forth. Thus every element whose index is divisible by two
+ * gives the beginning of a range that is in the list, and every element whose
+ * index is not divisible by two gives the beginning of a range not in the
+ * list. If the final element's index is divisible by two, the inversion list
+ * extends to the platform's infinity; otherwise the highest code point in the
+ * inversion list is the contents of that element minus 1.
+ *
+ * A range that contains just a single code point N will look like
+ * invlist[i] == N
+ * invlist[i+1] == N+1
+ *
+ * If N is UV_MAX (the highest representable code point on the machine), N+1 is
+ * impossible to represent, so element [i+1] is omitted. The single element
+ * inversion list
+ * invlist[0] == UV_MAX
+ * contains just UV_MAX, but is interpreted as matching to infinity.
+ *
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
* This implementation reserves an element at the beginning of each inversion
* list to always contain 0; there is an additional flag in the header which
* indicates if the list begins at the 0, or is offset to begin at the next
- * element.
+ * element. This means that the inversion list can be inverted without any
+ * copying; just flip the flag.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
- * More will be coming when functionality is added later.
*
* The inversion list data structure is currently implemented as an SV pointing
* to an array of UVs that the SV thinks are bytes. This allows us to have an
/* The header definitions are in F<invlist_inline.h> */
+#ifndef PERL_IN_XSUB_RE
+
PERL_STATIC_INLINE UV*
S__invlist_array_init(SV* const invlist, const bool will_have_0)
{
return zero_addr + *offset;
}
+#endif
+
PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
{
STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
{
- /* Replaces the inversion list in 'src' with the one in 'dest'. It steals
- * the list from 'src', so 'src' is made to have a NULL list. This is
- * similar to what SvSetMagicSV() would do, if it were implemented on
+ /* Replaces the inversion list in 'dest' with the one from 'src'. It
+ * steals the list from 'src', so 'src' is made to have a NULL list. This
+ * is similar to what SvSetMagicSV() would do, if it were implemented on
* inversion lists, though this routine avoids a copy */
const UV src_len = _invlist_len(src);
return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
}
+#ifndef PERL_IN_XSUB_RE
+
PERL_STATIC_INLINE UV
S_invlist_max(SV* const invlist)
{
? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
: FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
}
-
-#ifndef PERL_IN_XSUB_RE
SV*
Perl__new_invlist(pTHX_ IV initial_size)
{
return invlist;
}
-#endif /* ifndef PERL_IN_XSUB_RE */
STATIC void
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
UV final_element = len - 1;
array = invlist_array(invlist);
- if (array[final_element] > start
+ if ( array[final_element] > start
|| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
{
Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
}
- /* Here, it is a legal append. If the new range begins with the first
- * value not in the set, it is extending the set, so the new first
- * value not in the set is one greater than the newly extended range.
- * */
+ /* Here, it is a legal append. If the new range begins 1 above the end
+ * of the range below it, it is extending the range below it, so the
+ * new first value not in the set is one greater than the newly
+ * extended range. */
offset = *get_invlist_offset_addr(invlist);
if (array[final_element] == start) {
if (end != UV_MAX) {
}
else {
/* But if the end is the maximum representable on the machine,
- * just let the range that this would extend to have no end */
+ * assume that infinity was actually what was meant. Just let
+ * the range that this would extend to have no end */
invlist_set_len(invlist, len - 1, offset);
}
return;
}
}
-#ifndef PERL_IN_XSUB_RE
-
-IV
+SSize_t
Perl__invlist_search(SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
const bool complement_b, SV** output)
{
- /* Take the union of two inversion lists and point <output> to it. *output
- * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise just its contents will be modified to be
- * the union. The first list, <a>, may be NULL, in which case a copy of
- * the second list is returned. If <complement_b> is TRUE, the union is
- * taken of the complement (inversion) of <b> instead of b itself.
+ /* Take the union of two inversion lists and point '*output' to it. On
+ * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
+ * even 'a' or 'b'). If to an inversion list, the contents of the original
+ * list will be replaced by the union. The first list, 'a', may be
+ * NULL, in which case a copy of the second list is placed in '*output'.
+ * If 'complement_b' is TRUE, the union is taken of the complement
+ * (inversion) of 'b' instead of b itself.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
UV i_b = 0;
UV i_u = 0;
- bool has_something_from_a = FALSE;
- bool has_something_from_b = FALSE;
-
-
/* running count, as explained in the algorithm source book; items are
* stopped accumulating and are output when the count changes to/from 0.
- * The count is incremented when we start a range that's in the set, and
- * decremented when we start a range that's not in the set. So its range
- * is 0 to 2. Only when the count is zero is something not in the set.
- */
+ * The count is incremented when we start a range that's in an input's set,
+ * and decremented when we start a range that's not in a set. So this
+ * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
+ * and hence nothing goes into the union; 1, just one of the inputs is in
+ * its set (and its current range gets added to the union); and 2 when both
+ * inputs are in their sets. */
UV count = 0;
PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
+ assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST);
len_b = _invlist_len(b);
if (len_b == 0) {
- /* Here, 'b' is empty. If the output is the complement of 'b', the
- * union is all possible code points, and we need not even look at 'a'.
- * It's easiest to create a new inversion list that matches everything.
- * */
+ /* Here, 'b' is empty, hence it's complement is all possible code
+ * points. So if the union includes the complement of 'b', it includes
+ * everything, and we need not even look at 'a'. It's easiest to
+ * create a new inversion list that matches everything. */
if (complement_b) {
- SV* everything = _new_invlist(1);
- _append_range_to_invlist(everything, 0, UV_MAX);
+ SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
- /* If the output didn't exist, just point it at the new list */
- if (*output == NULL) {
+ if (*output == NULL) { /* If the output didn't exist, just point it
+ at the new list */
*output = everything;
- return;
+ }
+ else { /* Otherwise, replace its contents with the new list */
+ invlist_replace_list_destroys_src(*output, everything);
+ SvREFCNT_dec_NN(everything);
}
- /* Otherwise, replace its contents with the new list */
- invlist_replace_list_destroys_src(*output, everything);
- SvREFCNT_dec_NN(everything);
return;
}
- /* Here, we don't want the complement of 'b', and since it is empty,
+ /* Here, we don't want the complement of 'b', and since 'b' is empty,
* the union will come entirely from 'a'. If 'a' is NULL or empty, the
* output will be empty */
- if (a == NULL) {
- *output = _new_invlist(0);
+ if (a == NULL || _invlist_len(a) == 0) {
+ if (*output == NULL) {
+ *output = _new_invlist(0);
+ }
+ else {
+ invlist_clear(*output);
+ }
return;
}
- if (_invlist_len(a) == 0) {
- invlist_clear(*output);
+ /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
+ * union. We can just return a copy of 'a' if '*output' doesn't point
+ * to an existing list */
+ if (*output == NULL) {
+ *output = invlist_clone(a);
return;
}
- /* Here, 'a' is not empty, and entirely determines the union. If the
- * output is not to overwrite 'b', we can just return 'a'. */
- if (*output != b) {
-
- /* If the output is to overwrite 'a', we have a no-op, as it's
- * already in 'a' */
- if (*output == a) {
- return;
- }
-
- /* But otherwise we have to copy 'a' to the output */
- *output = invlist_clone(a);
+ /* If the output is to overwrite 'a', we have a no-op, as it's
+ * already in 'a' */
+ if (*output == a) {
return;
}
- /* Here, 'b' is to be overwritten by the output, which will be 'a' */
+ /* Here, '*output' is to be overwritten by 'a' */
u = invlist_clone(a);
invlist_replace_list_destroys_src(*output, u);
SvREFCNT_dec_NN(u);
- return;
+ return;
}
+ /* Here 'b' is not empty. See about 'a' */
+
if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
/* Here, 'a' is empty (and b is not). That means the union will come
- * entirely from 'b'. If the output is not to overwrite 'a', we can
- * just return what's in 'b'. */
- if (*output != a) {
-
- /* If the output is to overwrite 'b', it's already in 'b', but
- * otherwise we have to copy 'b' to the output */
- if (*output != b) {
- *output = invlist_clone(b);
- }
+ * entirely from 'b'. If '*output' is NULL, we can directly return a
+ * clone of 'b'. Otherwise, we replace the contents of '*output' with
+ * the clone */
- /* And if the output is to be the inversion of 'b', do that */
- if (complement_b) {
- _invlist_invert(*output);
- }
-
- return;
+ SV ** dest = (*output == NULL) ? output : &u;
+ *dest = invlist_clone(b);
+ if (complement_b) {
+ _invlist_invert(*dest);
}
- /* Here, 'a', which is empty or even NULL, is to be overwritten by the
- * output, which will either be 'b' or the complement of 'b' */
-
- if (a == NULL) {
- *output = invlist_clone(b);
- }
- else {
- u = invlist_clone(b);
+ if (dest == &u) {
invlist_replace_list_destroys_src(*output, u);
SvREFCNT_dec_NN(u);
- }
-
- if (complement_b) {
- _invlist_invert(*output);
}
return;
u = _new_invlist(len_a + len_b);
/* Will contain U+0000 if either component does */
- array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
- || (len_b > 0 && array_b[0] == 0));
+ array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
+ || (len_b > 0 && array_b[0] == 0));
- /* Go through each list item by item, stopping when exhausted one of
- * them */
+ /* Go through each input list item by item, stopping when have exhausted
+ * one of them */
while (i_a < len_a && i_b < len_b) {
UV cp; /* The element to potentially add to the union's array */
bool cp_in_set; /* is it in the the input list's set or not */
/* We need to take one or the other of the two inputs for the union.
* Since we are merging two sorted lists, we take the smaller of the
- * next items. In case of a tie, we take the one that is in its set
- * first. If we took one not in the set first, it would decrement the
- * count, possibly to 0 which would cause it to be output as ending the
- * range, and the next time through we would take the same number, and
- * output it again as beginning the next range. By doing it the
- * opposite way, there is no possibility that the count will be
- * momentarily decremented to 0, and thus the two adjoining ranges will
- * be seamlessly merged. (In a tie and both are in the set or both not
- * in the set, it doesn't matter which we take first.) */
- if (array_a[i_a] < array_b[i_b]
- || (array_a[i_a] == array_b[i_b]
+ * next items. In case of a tie, we take first the one that is in its
+ * set. If we first took the one not in its set, it would decrement
+ * the count, possibly to 0 which would cause it to be output as ending
+ * the range, and the next time through we would take the same number,
+ * and output it again as beginning the next range. By doing it the
+ * opposite way, there is no possibility that the count will be
+ * momentarily decremented to 0, and thus the two adjoining ranges will
+ * be seamlessly merged. (In a tie and both are in the set or both not
+ * in the set, it doesn't matter which we take first.) */
+ if ( array_a[i_a] < array_b[i_b]
+ || ( array_a[i_a] == array_b[i_b]
&& ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
{
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
- cp= array_a[i_a++];
- has_something_from_a = TRUE;
+ cp = array_a[i_a++];
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
cp = array_b[i_b++];
- has_something_from_b = TRUE;
}
/* Here, have chosen which of the two inputs to look at. Only output
}
}
- /* Here, we are finished going through at least one of the lists, which
- * means there is something remaining in at most one. We check if the list
- * that hasn't been exhausted is positioned such that we are in the middle
- * of a range in its set or not. (i_a and i_b point to the element beyond
- * the one we care about.) If in the set, we decrement 'count'; if 0, there
- * is potentially more to output.
- * There are four cases:
- * 1) Both weren't in their sets, count is 0, and remains 0. What's left
- * in the union is entirely from the non-exhausted set.
- * 2) Both were in their sets, count is 2. Nothing further should
- * be output, as everything that remains will be in the exhausted
- * list's set, hence in the union; decrementing to 1 but not 0 insures
- * that
- * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
- * Nothing further should be output because the union includes
- * everything from the exhausted set. Not decrementing ensures that.
- * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
- * decrementing to 0 insures that we look at the remainder of the
- * non-exhausted set */
+
+ /* The loop above increments the index into exactly one of the input lists
+ * each iteration, and ends when either index gets to its list end. That
+ * means the other index is lower than its end, and so something is
+ * remaining in that one. We decrement 'count', as explained below, if
+ * that list is in its set. (i_a and i_b each currently index the element
+ * beyond the one we care about.) */
if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
|| (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
{
count--;
}
- /* The final length is what we've output so far, plus what else is about to
- * be output. (If 'count' is non-zero, then the input list we exhausted
- * has everything remaining up to the machine's limit in its set, and hence
- * in the union, so there will be no further output. */
+ /* Above we decremented 'count' if the list that had unexamined elements in
+ * it was in its set. This has made it so that 'count' being non-zero
+ * means there isn't anything left to output; and 'count' equal to 0 means
+ * that what is left to output is precisely that which is left in the
+ * non-exhausted input list.
+ *
+ * To see why, note first that the exhausted input obviously has nothing
+ * left to add to the union. If it was in its set at its end, that means
+ * the set extends from here to the platform's infinity, and hence so does
+ * the union and the non-exhausted set is irrelevant. The exhausted set
+ * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
+ * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
+ * 'count' remains at 1. This is consistent with the decremented 'count'
+ * != 0 meaning there's nothing left to add to the union.
+ *
+ * But if the exhausted input wasn't in its set, it contributed 0 to
+ * 'count', and the rest of the union will be whatever the other input is.
+ * If 'count' was 0, neither list was in its set, and 'count' remains 0;
+ * otherwise it gets decremented to 0. This is consistent with 'count'
+ * == 0 meaning the remainder of the union is whatever is left in the
+ * non-exhausted list. */
if (count != 0) {
-
- /* Here, there is nothing left to put in the union. If the union came
- * only from the input that it is to overwrite, this whole operation is
- * a no-op */
- if ( UNLIKELY(! has_something_from_b && *output == a)
- || UNLIKELY(! has_something_from_a && *output == b))
- {
- SvREFCNT_dec_NN(u);
- return;
- }
-
len_u = i_u;
}
else {
- /* When 'count' is 0, the list that was exhausted (if one was shorter
- * than the other) ended with everything above it not in its set. That
- * means that the remaining part of the union is precisely the same as
- * the non-exhausted list, so can just copy it unchanged. If only one
- * of the inputs contributes to the union, and the output is to
- * overwite that particular input, then this whole operation was a
- * no-op. */
-
IV copy_count = len_a - i_a;
- if (copy_count > 0) {
- if (UNLIKELY(! has_something_from_b && *output == a)) {
- SvREFCNT_dec_NN(u);
- return;
- }
+ if (copy_count > 0) { /* The non-exhausted input is 'a' */
Copy(array_a + i_a, array_u + i_u, copy_count, UV);
- len_u = i_u + copy_count;
}
- else if ((copy_count = len_b - i_b) > 0) {
- if (UNLIKELY(! has_something_from_a && *output == b)) {
- SvREFCNT_dec_NN(u);
- return;
- }
+ else { /* The non-exhausted input is b */
+ copy_count = len_b - i_b;
Copy(array_b + i_b, array_u + i_u, copy_count, UV);
- len_u = i_u + copy_count;
- } else if ( UNLIKELY(! has_something_from_b && *output == a)
- || UNLIKELY(! has_something_from_a && *output == b))
- {
- /* Here, both arrays are exhausted, so no need to do any additional
- * copying. Also here, the union came only from the input that it is
- * to overwrite, so this whole operation is a no-op */
- SvREFCNT_dec_NN(u);
- return;
}
+ len_u = i_u + copy_count;
}
/* Set the result to the final length, which can change the pointer to
array_u = invlist_array(u);
}
- /* If the output is not to overwrite either of the inputs, just return the
- * calculated union */
- if (a != *output && b != *output) {
+ if (*output == NULL) { /* Simply return the new inversion list */
*output = u;
}
else {
- /* Here, the output is to be the same as one of the input scalars,
- * hence replacing it. The simple thing to do is to free the input
- * scalar, making it instead be the output one. But experience has
- * shown [perl #127392] that if the input is a mortal, we can get a
- * huge build-up of these during regex compilation before they get
- * freed. So for that case, replace just the input's interior with
- * the output's, and then free the output */
-
- assert(! invlist_is_iterating(*output));
-
- if (! SvTEMP(*output)) {
- SvREFCNT_dec_NN(*output);
- *output = u;
- }
- else {
- invlist_replace_list_destroys_src(*output, u);
- SvREFCNT_dec_NN(u);
- }
+ /* Otherwise, overwrite the inversion list that was in '*output'. We
+ * could instead free '*output', and then set it to 'u', but experience
+ * has shown [perl #127392] that if the input is a mortal, we can get a
+ * huge build-up of these during regex compilation before they get
+ * freed. */
+ invlist_replace_list_destroys_src(*output, u);
+ SvREFCNT_dec_NN(u);
}
return;
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
const bool complement_b, SV** i)
{
- /* Take the intersection of two inversion lists and point <i> to it. *i
- * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented if not already a
- * temporary (mortal); otherwise just its contents will be modified to be
- * the intersection. The first list, <a>, may be NULL, in which case an
- * empty list is returned. If <complement_b> is TRUE, the result will be
- * the intersection of <a> and the complement (or inversion) of <b> instead
- * of <b> directly.
+ /* Take the intersection of two inversion lists and point '*i' to it. On
+ * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
+ * even 'a' or 'b'). If to an inversion list, the contents of the original
+ * list will be replaced by the intersection. The first list, 'a', may be
+ * NULL, in which case '*i' will be an empty list. If 'complement_b' is
+ * TRUE, the result will be the intersection of 'a' and the complement (or
+ * inversion) of 'b' instead of 'b' directly.
*
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
UV i_b = 0;
UV i_r = 0;
- /* running count, as explained in the algorithm source book; items are
- * stopped accumulating and are output when the count changes to/from 2.
- * The count is incremented when we start a range that's in the set, and
- * decremented when we start a range that's not in the set. So its range
- * is 0 to 2. Only when the count is 2 is something in the intersection.
- */
+ /* running count of how many of the two inputs are postitioned at ranges
+ * that are in their sets. As explained in the algorithm source book,
+ * items are stopped accumulating and are output when the count changes
+ * to/from 2. The count is incremented when we start a range that's in an
+ * input's set, and decremented when we start a range that's not in a set.
+ * Only when it is 2 are we in the intersection. */
UV count = 0;
- bool has_something_from_a = FALSE;
- bool has_something_from_b = FALSE;
-
PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
+ assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST);
/* Special case if either one is empty */
len_a = (a == NULL) ? 0 : _invlist_len(a);
return;
}
- /* If not overwriting either input, just make a copy of 'a' */
- if (*i != b) {
+ if (*i == NULL) {
*i = invlist_clone(a);
return;
}
- /* Here we are overwriting 'b' with 'a's contents */
r = invlist_clone(a);
invlist_replace_list_destroys_src(*i, r);
SvREFCNT_dec_NN(r);
r= _new_invlist(len_a + len_b);
/* Will contain U+0000 iff both components do */
- array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
- && len_b > 0 && array_b[0] == 0);
+ array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
+ && len_b > 0 && array_b[0] == 0);
- /* Go through each list item by item, stopping when exhausted one of
+ /* Go through each list item by item, stopping when have exhausted one of
* them */
while (i_a < len_a && i_b < len_b) {
UV cp; /* The element to potentially add to the intersection's
/* We need to take one or the other of the two inputs for the
* intersection. Since we are merging two sorted lists, we take the
- * smaller of the next items. In case of a tie, we take the one that
- * is not in its set first (a difference from the union algorithm). If
- * we took one in the set first, it would increment the count, possibly
- * to 2 which would cause it to be output as starting a range in the
- * intersection, and the next time through we would take that same
- * number, and output it again as ending the set. By doing it the
- * opposite of this, there is no possibility that the count will be
- * momentarily incremented to 2. (In a tie and both are in the set or
- * both not in the set, it doesn't matter which we take first.) */
- if (array_a[i_a] < array_b[i_b]
- || (array_a[i_a] == array_b[i_b]
+ * smaller of the next items. In case of a tie, we take first the one
+ * that is not in its set (a difference from the union algorithm). If
+ * we first took the one in its set, it would increment the count,
+ * possibly to 2 which would cause it to be output as starting a range
+ * in the intersection, and the next time through we would take that
+ * same number, and output it again as ending the set. By doing the
+ * opposite of this, there is no possibility that the count will be
+ * momentarily incremented to 2. (In a tie and both are in the set or
+ * both not in the set, it doesn't matter which we take first.) */
+ if ( array_a[i_a] < array_b[i_b]
+ || ( array_a[i_a] == array_b[i_b]
&& ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
{
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
- cp= array_a[i_a++];
- has_something_from_a = TRUE;
+ cp = array_a[i_a++];
}
else {
cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
cp= array_b[i_b++];
- has_something_from_b = TRUE;
}
/* Here, have chosen which of the two inputs to look at. Only output
}
count--;
}
+
}
- /* Here, we are finished going through at least one of the lists, which
- * means there is something remaining in at most one. We check if the list
- * that has been exhausted is positioned such that we are in the middle
- * of a range in its set or not. (i_a and i_b point to elements 1 beyond
- * the ones we care about.) There are four cases:
- * 1) Both weren't in their sets, count is 0, and remains 0. There's
- * nothing left in the intersection.
- * 2) Both were in their sets, count is 2 and perhaps is incremented to
- * above 2. What should be output is exactly that which is in the
- * non-exhausted set, as everything it has is also in the intersection
- * set, and everything it doesn't have can't be in the intersection
- * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
- * gets incremented to 2. Like the previous case, the intersection is
- * everything that remains in the non-exhausted set.
- * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
- * remains 1. And the intersection has nothing more. */
+ /* The loop above increments the index into exactly one of the input lists
+ * each iteration, and ends when either index gets to its list end. That
+ * means the other index is lower than its end, and so something is
+ * remaining in that one. We increment 'count', as explained below, if the
+ * exhausted list was in its set. (i_a and i_b each currently index the
+ * element beyond the one we care about.) */
if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
- || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
+ || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
{
count++;
}
- if (count < 2) {
-
- /* Here, there is nothing left to put in the intersection. If the
- * intersection came only from the input that it is to overwrite, this
- * whole operation is a no-op */
- if ( UNLIKELY(! has_something_from_b && *i == a)
- || UNLIKELY(! has_something_from_a && *i == b))
- {
- SvREFCNT_dec_NN(r);
- return;
- }
-
+ /* Above we incremented 'count' if the exhausted list was in its set. This
+ * has made it so that 'count' being below 2 means there is nothing left to
+ * output; otheriwse what's left to add to the intersection is precisely
+ * that which is left in the non-exhausted input list.
+ *
+ * To see why, note first that the exhausted input obviously has nothing
+ * left to affect the intersection. If it was in its set at its end, that
+ * means the set extends from here to the platform's infinity, and hence
+ * anything in the non-exhausted's list will be in the intersection, and
+ * anything not in it won't be. Hence, the rest of the intersection is
+ * precisely what's in the non-exhausted list The exhausted set also
+ * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
+ * it means 'count' is now at least 2. This is consistent with the
+ * incremented 'count' being >= 2 means to add the non-exhausted list to
+ * the intersection.
+ *
+ * But if the exhausted input wasn't in its set, it contributed 0 to
+ * 'count', and the intersection can't include anything further; the
+ * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
+ * incremented. This is consistent with 'count' being < 2 meaning nothing
+ * further to add to the intersection. */
+ if (count < 2) { /* Nothing left to put in the intersection. */
len_r = i_r;
}
- else {
- /* When 'count' is 2 or more, the list that was exhausted, what remains
- * in the intersection is precisely the same as the non-exhausted list,
- * so can just copy it unchanged. If only one of the inputs
- * contributes to the intersection, and the output is to overwite that
- * particular input, then this whole operation was a no-op. */
-
+ else { /* copy the non-exhausted list, unchanged. */
IV copy_count = len_a - i_a;
- if (copy_count > 0) {
- if (UNLIKELY(! has_something_from_b && *i == a)) {
- SvREFCNT_dec_NN(r);
- return;
- }
+ if (copy_count > 0) { /* a is the one with stuff left */
Copy(array_a + i_a, array_r + i_r, copy_count, UV);
- len_r = i_r + copy_count;
}
- else if ((copy_count = len_b - i_b) > 0) {
- if (UNLIKELY(! has_something_from_a && *i == b)) {
- SvREFCNT_dec_NN(r);
- return;
- }
+ else { /* b is the one with stuff left */
+ copy_count = len_b - i_b;
Copy(array_b + i_b, array_r + i_r, copy_count, UV);
- len_r = i_r + copy_count;
- } else if ( UNLIKELY(! has_something_from_b && *i == a)
- || UNLIKELY(! has_something_from_a && *i == b))
- {
- /* Here, both arrays are exhausted, so no need to do any additional
- * copying. Also here, the intersection came only from the input
- * that it is to overwrite, so this whole operation is a no-op */
- SvREFCNT_dec_NN(r);
- return;
}
+ len_r = i_r + copy_count;
}
/* Set the result to the final length, which can change the pointer to
array_r = invlist_array(r);
}
- /* If the output is not to overwrite either of the inputs, just return the
- * calculated intersection */
- if (a != *i && b != *i) {
+ if (*i == NULL) { /* Simply return the calculated intersection */
*i = r;
}
- else {
- /* Here, the output is to be the same as one of the input scalars,
- * hence replacing it. The simple thing to do is to free the input
- * scalar, making it instead be the output one. But experience has
- * shown [perl #127392] that if the input is a mortal, we can get a
- * huge build-up of these during regex compilation before they get
- * freed. So for that case, replace just the input's interior with
- * the output's, and then free the output. A short-cut in this case
- * is if the output is empty, we can just set the input to be empty */
-
- assert(! invlist_is_iterating(*i));
-
- if (! SvTEMP(*i)) {
- SvREFCNT_dec_NN(*i);
- *i = r;
+ else { /* Otherwise, replace the existing inversion list in '*i'. We could
+ instead free '*i', and then set it to 'r', but experience has
+ shown [perl #127392] that if the input is a mortal, we can get a
+ huge build-up of these during regex compilation before they get
+ freed. */
+ if (len_r) {
+ invlist_replace_list_destroys_src(*i, r);
}
else {
- if (len_r) {
- invlist_replace_list_destroys_src(*i, r);
- }
- else {
- invlist_clear(*i);
- }
- SvREFCNT_dec_NN(r);
+ invlist_clear(*i);
}
+ SvREFCNT_dec_NN(r);
}
return;
}
SV*
-Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
+Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
{
/* Add the range from 'start' to 'end' inclusive to the inversion list's
* set. A pointer to the inversion list is returned. This may actually be
* a new list, in which case the passed in one has been destroyed. The
* passed-in inversion list can be NULL, in which case a new one is created
- * with just the one range in it */
-
- SV* range_invlist;
- UV len;
-
+ * with just the one range in it. The new list is not necessarily
+ * NUL-terminated. Space is not freed if the inversion list shrinks as a
+ * result of this function. The gain would not be large, and in many
+ * cases, this is called multiple times on a single inversion list, so
+ * anything freed may almost immediately be needed again.
+ *
+ * This used to mostly call the 'union' routine, but that is much more
+ * heavyweight than really needed for a single range addition */
+
+ UV* array; /* The array implementing the inversion list */
+ UV len; /* How many elements in 'array' */
+ SSize_t i_s; /* index into the invlist array where 'start'
+ should go */
+ SSize_t i_e = 0; /* And the index where 'end' should go */
+ UV cur_highest; /* The highest code point in the inversion list
+ upon entry to this function */
+
+ /* This range becomes the whole inversion list if none already existed */
if (invlist == NULL) {
invlist = _new_invlist(2);
- len = 0;
+ _append_range_to_invlist(invlist, start, end);
+ return invlist;
}
- else {
- len = _invlist_len(invlist);
+
+ /* Likewise, if the inversion list is currently empty */
+ len = _invlist_len(invlist);
+ if (len == 0) {
+ _append_range_to_invlist(invlist, start, end);
+ return invlist;
}
- /* If comes after the final entry actually in the list, can just append it
- * to the end, */
- if (len == 0
- || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
- && start >= invlist_array(invlist)[len - 1]))
- {
- _append_range_to_invlist(invlist, start, end);
- return invlist;
+ /* Starting here, we have to know the internals of the list */
+ array = invlist_array(invlist);
+
+ /* If the new range ends higher than the current highest ... */
+ cur_highest = invlist_highest(invlist);
+ if (end > cur_highest) {
+
+ /* If the whole range is higher, we can just append it */
+ if (start > cur_highest) {
+ _append_range_to_invlist(invlist, start, end);
+ return invlist;
+ }
+
+ /* Otherwise, add the portion that is higher ... */
+ _append_range_to_invlist(invlist, cur_highest + 1, end);
+
+ /* ... and continue on below to handle the rest. As a result of the
+ * above append, we know that the index of the end of the range is the
+ * final even numbered one of the array. Recall that the final element
+ * always starts a range that extends to infinity. If that range is in
+ * the set (meaning the set goes from here to infinity), it will be an
+ * even index, but if it isn't in the set, it's odd, and the final
+ * range in the set is one less, which is even. */
+ if (end == UV_MAX) {
+ i_e = len;
+ }
+ else {
+ i_e = len - 2;
+ }
+ }
+
+ /* We have dealt with appending, now see about prepending. If the new
+ * range starts lower than the current lowest ... */
+ if (start < array[0]) {
+
+ /* Adding something which has 0 in it is somewhat tricky, and uncommon.
+ * Let the union code handle it, rather than having to know the
+ * trickiness in two code places. */
+ if (UNLIKELY(start == 0)) {
+ SV* range_invlist;
+
+ range_invlist = _new_invlist(2);
+ _append_range_to_invlist(range_invlist, start, end);
+
+ _invlist_union(invlist, range_invlist, &invlist);
+
+ SvREFCNT_dec_NN(range_invlist);
+
+ return invlist;
+ }
+
+ /* If the whole new range comes before the first entry, and doesn't
+ * extend it, we have to insert it as an additional range */
+ if (end < array[0] - 1) {
+ i_s = i_e = -1;
+ goto splice_in_new_range;
+ }
+
+ /* Here the new range adjoins the existing first range, extending it
+ * downwards. */
+ array[0] = start;
+
+ /* And continue on below to handle the rest. We know that the index of
+ * the beginning of the range is the first one of the array */
+ i_s = 0;
+ }
+ else { /* Not prepending any part of the new range to the existing list.
+ * Find where in the list it should go. This finds i_s, such that:
+ * invlist[i_s] <= start < array[i_s+1]
+ */
+ i_s = _invlist_search(invlist, start);
+ }
+
+ /* At this point, any extending before the beginning of the inversion list
+ * and/or after the end has been done. This has made it so that, in the
+ * code below, each endpoint of the new range is either in a range that is
+ * in the set, or is in a gap between two ranges that are. This means we
+ * don't have to worry about exceeding the array bounds.
+ *
+ * Find where in the list the new range ends (but we can skip this if we
+ * have already determined what it is, or if it will be the same as i_s,
+ * which we already have computed) */
+ if (i_e == 0) {
+ i_e = (start == end)
+ ? i_s
+ : _invlist_search(invlist, end);
+ }
+
+ /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
+ * is a range that goes to infinity there is no element at invlist[i_e+1],
+ * so only the first relation holds. */
+
+ if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
+
+ /* Here, the ranges on either side of the beginning of the new range
+ * are in the set, and this range starts in the gap between them.
+ *
+ * The new range extends the range above it downwards if the new range
+ * ends at or above that range's start */
+ const bool extends_the_range_above = ( end == UV_MAX
+ || end + 1 >= array[i_s+1]);
+
+ /* The new range extends the range below it upwards if it begins just
+ * after where that range ends */
+ if (start == array[i_s]) {
+
+ /* If the new range fills the entire gap between the other ranges,
+ * they will get merged together. Other ranges may also get
+ * merged, depending on how many of them the new range spans. In
+ * the general case, we do the merge later, just once, after we
+ * figure out how many to merge. But in the case where the new
+ * range exactly spans just this one gap (possibly extending into
+ * the one above), we do the merge here, and an early exit. This
+ * is done here to avoid having to special case later. */
+ if (i_e - i_s <= 1) {
+
+ /* If i_e - i_s == 1, it means that the new range terminates
+ * within the range above, and hence 'extends_the_range_above'
+ * must be true. (If the range above it extends to infinity,
+ * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
+ * will be 0, so no harm done.) */
+ if (extends_the_range_above) {
+ Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
+ invlist_set_len(invlist,
+ len - 2,
+ *(get_invlist_offset_addr(invlist)));
+ return invlist;
+ }
+
+ /* Here, i_e must == i_s. We keep them in sync, as they apply
+ * to the same range, and below we are about to decrement i_s
+ * */
+ i_e--;
+ }
+
+ /* Here, the new range is adjacent to the one below. (It may also
+ * span beyond the range above, but that will get resolved later.)
+ * Extend the range below to include this one. */
+ array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
+ i_s--;
+ start = array[i_s];
+ }
+ else if (extends_the_range_above) {
+
+ /* Here the new range only extends the range above it, but not the
+ * one below. It merges with the one above. Again, we keep i_e
+ * and i_s in sync if they point to the same range */
+ if (i_e == i_s) {
+ i_e++;
+ }
+ i_s++;
+ array[i_s] = start;
+ }
+ }
+
+ /* Here, we've dealt with the new range start extending any adjoining
+ * existing ranges.
+ *
+ * If the new range extends to infinity, it is now the final one,
+ * regardless of what was there before */
+ if (UNLIKELY(end == UV_MAX)) {
+ invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
+ return invlist;
}
- /* Here, can't just append things, create and return a new inversion list
- * which is the union of this range and the existing inversion list. (If
- * the new range is well-behaved wrt to the old one, we could just insert
- * it, doing a Move() down on the tail of the old one (potentially growing
- * it first). But to determine that means we would have the extra
- * (possibly throw-away) work of first finding where the new one goes and
- * whether it disrupts (splits) an existing range, so it doesn't appear to
- * me (khw) that it's worth it) */
- range_invlist = _new_invlist(2);
- _append_range_to_invlist(range_invlist, start, end);
+ /* If i_e started as == i_s, it has also been dealt with,
+ * and been updated to the new i_s, which will fail the following if */
+ if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
- _invlist_union(invlist, range_invlist, &invlist);
+ /* Here, the ranges on either side of the end of the new range are in
+ * the set, and this range ends in the gap between them.
+ *
+ * If this range is adjacent to (hence extends) the range above it, it
+ * becomes part of that range; likewise if it extends the range below,
+ * it becomes part of that range */
+ if (end + 1 == array[i_e+1]) {
+ i_e++;
+ array[i_e] = start;
+ }
+ else if (start <= array[i_e]) {
+ array[i_e] = end + 1;
+ i_e--;
+ }
+ }
- /* The temporary can be freed */
- SvREFCNT_dec_NN(range_invlist);
+ if (i_s == i_e) {
+
+ /* If the range fits entirely in an existing range (as possibly already
+ * extended above), it doesn't add anything new */
+ if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
+ return invlist;
+ }
+
+ /* Here, no part of the range is in the list. Must add it. It will
+ * occupy 2 more slots */
+ splice_in_new_range:
+
+ invlist_extend(invlist, len + 2);
+ array = invlist_array(invlist);
+ /* Move the rest of the array down two slots. Don't include any
+ * trailing NUL */
+ Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
+
+ /* Do the actual splice */
+ array[i_e+1] = start;
+ array[i_e+2] = end + 1;
+ invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
+ return invlist;
+ }
+
+ /* Here the new range crossed the boundaries of a pre-existing range. The
+ * code above has adjusted things so that both ends are in ranges that are
+ * in the set. This means everything in between must also be in the set.
+ * Just squash things together */
+ Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
+ invlist_set_len(invlist,
+ len - i_e + i_s,
+ *(get_invlist_offset_addr(invlist)));
return invlist;
}
PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
- _append_range_to_invlist(invlist, element0, element0);
+ invlist = add_cp_to_invlist(invlist, element0);
offset = *get_invlist_offset_addr(invlist);
invlist_set_len(invlist, size, offset);
RExC_seen |= REG_LOOKBEHIND_SEEN;
RExC_in_lookbehind++;
RExC_parse++;
- assert(RExC_parse < RExC_end);
+ if (RExC_parse >= RExC_end) {
+ vFAIL("Sequence (?... not terminated");
+ }
+
/* FALLTHROUGH */
case '=': /* (?=...) */
RExC_seen_zerolen++;
FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
}
else if (ret == NULL)
- ret = latest;
+ ret = latest;
*flagp |= flags&(HASWIDTH|POSTPONED);
if (chain == NULL) /* First piece. */
*flagp |= flags&SPSTART;
}
-/*
- * reg_recode
- *
- * It returns the code point in utf8 for the value in *encp.
- * value: a code value in the source encoding
- * encp: a pointer to an Encode object
- *
- * If the result from Encode is not a single character,
- * it returns U+FFFD (Replacement character) and sets *encp to NULL.
- */
-STATIC UV
-S_reg_recode(pTHX_ const U8 value, SV **encp)
-{
- STRLEN numlen = 1;
- SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
- const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
- const STRLEN newlen = SvCUR(sv);
- UV uv = UNICODE_REPLACEMENT;
-
- PERL_ARGS_ASSERT_REG_RECODE;
-
- if (newlen)
- uv = SvUTF8(sv)
- ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
- : *(U8*)s;
-
- if (!newlen || numlen != newlen) {
- uv = UNICODE_REPLACEMENT;
- *encp = NULL;
- }
- return uv;
-}
-
PERL_STATIC_INLINE U8
S_compute_EXACTish(RExC_state_t *pRExC_state)
{
/*
- regatom - the lowest level
- Try to identify anything special at the start of the pattern. If there
- is, then handle it as required. This may involve generating a single regop,
- such as for an assertion; or it may involve recursing, such as to
- handle a () structure.
+ Try to identify anything special at the start of the current parse position.
+ If there is, then handle it as required. This may involve generating a
+ single regop, such as for an assertion; or it may involve recursing, such as
+ to handle a () structure.
If the string doesn't start with something special then we gobble up
- as much literal text as we can.
+ as much literal text as we can. If we encounter a quantifier, we have to
+ back off the final literal character, as that quantifier applies to just it
+ and not to the whole string of literals.
Once we have been able to handle whatever type of thing started the
sequence, we return.
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
+ /* Here, we have a literal character. Find the maximal string of
+ * them in the input that we can fit into a single EXACTish node.
+ * We quit at the first non-literal or when the node gets full */
for (p = RExC_parse;
len < upper_parse && p < RExC_end;
len++)
vFAIL(error_msg);
}
ender = result;
- if (IN_ENCODING && ender < 0x100) {
- goto recode_encoding;
- }
if (ender > 0xff) {
REQUIRE_UTF8(flagp);
}
if (RExC_recode_x_to_native) {
ender = LATIN1_TO_NATIVE(ender);
}
- else
#endif
- if (IN_ENCODING) {
- goto recode_encoding;
- }
}
else {
REQUIRE_UTF8(flagp);
form_short_octal_warning(p, numlen));
}
}
- if (IN_ENCODING && ender < 0x100)
- goto recode_encoding;
- break;
- recode_encoding:
- if (! RExC_override_recoding) {
- SV* enc = _get_encoding();
- ender = reg_recode((U8)ender, &enc);
- if (!enc && PASS2)
- ckWARNreg(p, "Invalid escape in the specified encoding");
- REQUIRE_UTF8(flagp);
- }
break;
case '\0':
if (p >= RExC_end)
* the node, close the node with just them, and set up to do
* this character again next time through, when it will be the
* only thing in its new node */
+
if ((next_is_quantifier = ( LIKELY(p < RExC_end)
&& UNLIKELY(ISMULT2(p))))
&& LIKELY(len))
redo_curchar:
+#ifdef ENABLE_REGEX_SETS_DEBUGGING
+ /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */
+ DEBUG_U(dump_regex_sets_structures(pRExC_state,
+ stack, fence, fence_stack));
+#endif
+
top_index = av_tindex_nomg(stack);
switch (curchar) {
}
/* Stack the position of this undealt-with left paren */
- fence = top_index + 1;
av_push(fence_stack, newSViv(fence));
+ fence = top_index + 1;
break;
case '\\':
vFAIL("Unexpected ')'");
}
- /* If at least two thing on the stack, treat this as an
+ /* If nothing after the fence, is missing an operand */
+ if (top_index - fence < 0) {
+ RExC_parse++;
+ goto bad_syntax;
+ }
+ /* If at least two things on the stack, treat this as an
* operator */
if (top_index - fence >= 1) {
goto join_operators;
{
SV* i = NULL;
SV* u = NULL;
- SV* element;
_invlist_union(lhs, rhs, &u);
_invlist_intersection(lhs, rhs, &i);
- /* _invlist_subtract will overwrite rhs
- without freeing what it already contains */
- element = rhs;
_invlist_subtract(u, i, &rhs);
SvREFCNT_dec_NN(i);
SvREFCNT_dec_NN(u);
- SvREFCNT_dec_NN(element);
break;
}
}
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
return node;
}
+
+#ifdef ENABLE_REGEX_SETS_DEBUGGING
+
+STATIC void
+S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state,
+ AV * stack, const IV fence, AV * fence_stack)
+{ /* Dumps the stacks in handle_regex_sets() */
+
+ const SSize_t stack_top = av_tindex_nomg(stack);
+ const SSize_t fence_stack_top = av_tindex_nomg(fence_stack);
+ SSize_t i;
+
+ PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES;
+
+ PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse);
+
+ if (stack_top < 0) {
+ PerlIO_printf(Perl_debug_log, "Nothing on stack\n");
+ }
+ else {
+ PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence);
+ for (i = stack_top; i >= 0; i--) {
+ SV ** element_ptr = av_fetch(stack, i, FALSE);
+ if (! element_ptr) {
+ }
+
+ if (IS_OPERATOR(*element_ptr)) {
+ PerlIO_printf(Perl_debug_log, "[%d]: %c\n",
+ (int) i, (int) SvIV(*element_ptr));
+ }
+ else {
+ PerlIO_printf(Perl_debug_log, "[%d] ", (int) i);
+ sv_dump(*element_ptr);
+ }
+ }
+ }
+
+ if (fence_stack_top < 0) {
+ PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n");
+ }
+ else {
+ PerlIO_printf(Perl_debug_log, "Fence_stack: \n");
+ for (i = fence_stack_top; i >= 0; i--) {
+ SV ** element_ptr = av_fetch(fence_stack, i, FALSE);
+ if (! element_ptr) {
+ }
+
+ PerlIO_printf(Perl_debug_log, "[%d]: %d\n",
+ (int) i, (int) SvIV(*element_ptr));
+ }
+ }
+}
+
+#endif
+
#undef IS_OPERATOR
#undef IS_OPERAND
}
}
non_portable_endpoint++;
- if (IN_ENCODING && value < 0x100) {
- goto recode_encoding;
- }
break;
case 'x':
RExC_parse--; /* function expects to be pointed at the 'x' */
}
}
non_portable_endpoint++;
- if (IN_ENCODING && value < 0x100)
- goto recode_encoding;
break;
case 'c':
value = grok_bslash_c(*RExC_parse++, PASS2);
}
}
non_portable_endpoint++;
- if (IN_ENCODING && value < 0x100)
- goto recode_encoding;
- break;
- }
- recode_encoding:
- if (! RExC_override_recoding) {
- SV* enc = _get_encoding();
- value = reg_recode((U8)value, &enc);
- if (!enc) {
- if (strict) {
- vFAIL("Invalid escape in the specified encoding");
- }
- else if (PASS2) {
- ckWARNreg(RExC_parse,
- "Invalid escape in the specified encoding");
- }
- }
break;
}
default:
else if (! SIZE_ONLY) {
/* Here, not in pass1 (in that pass we skip calculating the
- * contents of this class), and is /l, or is a POSIX class for
- * which /l doesn't matter (or is a Unicode property, which is
- * skipped here). */
+ * contents of this class), and is not /l, or is a POSIX class
+ * for which /l doesn't matter (or is a Unicode property, which
+ * is skipped here). */
if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
&cp_list);
}
}
- else if (UNI_SEMANTICS
+ else if ( UNI_SEMANTICS
|| classnum == _CC_ASCII
- || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+ || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
|| classnum == _CC_XDIGIT)))
{
/* We usually have to worry about /d and /a affecting what
SvREFCNT_dec_NN(cp_foldable_list);
}
- /* And combine the result (if any) with any inversion list from posix
+ /* And combine the result (if any) with any inversion lists from posix
* classes. The lists are kept separate up to now because we don't want to
* fold the classes (folding of those is automatically handled by the swash
* fetching code) */
- if (simple_posixes) {
- _invlist_union(cp_list, simple_posixes, &cp_list);
- SvREFCNT_dec_NN(simple_posixes);
+ if (simple_posixes) { /* These are the classes known to be unaffected by
+ /a, /aa, and /d */
+ if (cp_list) {
+ _invlist_union(cp_list, simple_posixes, &cp_list);
+ SvREFCNT_dec_NN(simple_posixes);
+ }
+ else {
+ cp_list = simple_posixes;
+ }
}
if (posixes || nposixes) {
- if (posixes && AT_LEAST_ASCII_RESTRICTED) {
+
+ /* We have to adjust /a and /aa */
+ if (AT_LEAST_ASCII_RESTRICTED) {
+
/* Under /a and /aa, nothing above ASCII matches these */
- _invlist_intersection(posixes,
- PL_XPosix_ptrs[_CC_ASCII],
- &posixes);
- }
- if (nposixes) {
- if (DEPENDS_SEMANTICS) {
- /* Under /d, everything in the upper half of the Latin1 range
- * matches these complements */
- ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ if (posixes) {
+ _invlist_intersection(posixes,
+ PL_XPosix_ptrs[_CC_ASCII],
+ &posixes);
}
- else if (AT_LEAST_ASCII_RESTRICTED) {
- /* Under /a and /aa, everything above ASCII matches these
- * complements */
+
+ /* Under /a and /aa, everything above ASCII matches these
+ * complements */
+ if (nposixes) {
_invlist_union_complement_2nd(nposixes,
PL_XPosix_ptrs[_CC_ASCII],
&nposixes);
}
- if (posixes) {
- _invlist_union(posixes, nposixes, &posixes);
- SvREFCNT_dec_NN(nposixes);
- }
- else {
- posixes = nposixes;
- }
}
+
if (! DEPENDS_SEMANTICS) {
- if (cp_list) {
- _invlist_union(cp_list, posixes, &cp_list);
- SvREFCNT_dec_NN(posixes);
+
+ /* For everything but /d, we can just add the current 'posixes' and
+ * 'nposixes' to the main list */
+ if (posixes) {
+ if (cp_list) {
+ _invlist_union(cp_list, posixes, &cp_list);
+ SvREFCNT_dec_NN(posixes);
+ }
+ else {
+ cp_list = posixes;
+ }
}
- else {
- cp_list = posixes;
+ if (nposixes) {
+ if (cp_list) {
+ _invlist_union(cp_list, nposixes, &cp_list);
+ SvREFCNT_dec_NN(nposixes);
+ }
+ else {
+ cp_list = nposixes;
+ }
}
}
else {
- /* Under /d, we put into a separate list the Latin1 things that
- * match only when the target string is utf8 */
- SV* nonascii_but_latin1_properties = NULL;
- _invlist_intersection(posixes, PL_UpperLatin1,
- &nonascii_but_latin1_properties);
- _invlist_subtract(posixes, nonascii_but_latin1_properties,
- &posixes);
- if (cp_list) {
- _invlist_union(cp_list, posixes, &cp_list);
- SvREFCNT_dec_NN(posixes);
+ /* Under /d, things like \w match upper Latin1 characters only if
+ * the target string is in UTF-8. But things like \W match all the
+ * upper Latin1 characters if the target string is not in UTF-8.
+ *
+ * Handle the case where there something like \W separately */
+ if (nposixes) {
+ SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
+
+ /* A complemented posix class matches all upper Latin1
+ * characters if not in UTF-8. And it matches just certain
+ * ones when in UTF-8. That means those certain ones are
+ * matched regardless, so can just be added to the
+ * unconditional list */
+ if (cp_list) {
+ _invlist_union(cp_list, nposixes, &cp_list);
+ SvREFCNT_dec_NN(nposixes);
+ nposixes = NULL;
+ }
+ else {
+ cp_list = nposixes;
+ }
+
+ /* Likewise for 'posixes' */
+ _invlist_union(posixes, cp_list, &cp_list);
+
+ /* Likewise for anything else in the range that matched only
+ * under UTF-8 */
+ if (has_upper_latin1_only_utf8_matches) {
+ _invlist_union(cp_list,
+ has_upper_latin1_only_utf8_matches,
+ &cp_list);
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
+
+ /* If we don't match all the upper Latin1 characters regardless
+ * of UTF-8ness, we have to set a flag to match the rest when
+ * not in UTF-8 */
+ _invlist_subtract(only_non_utf8_list, cp_list,
+ &only_non_utf8_list);
+ if (_invlist_len(only_non_utf8_list) != 0) {
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ }
}
else {
- cp_list = posixes;
- }
-
- if (has_upper_latin1_only_utf8_matches) {
+ /* Here there were no complemented posix classes. That means
+ * the upper Latin1 characters in 'posixes' match only when the
+ * target string is in UTF-8. So we have to add them to the
+ * list of those types of code points, while adding the
+ * remainder to the unconditional list.
+ *
+ * First calculate what they are */
+ SV* nonascii_but_latin1_properties = NULL;
+ _invlist_intersection(posixes, PL_UpperLatin1,
+ &nonascii_but_latin1_properties);
+
+ /* And add them to the final list of such characters. */
_invlist_union(has_upper_latin1_only_utf8_matches,
nonascii_but_latin1_properties,
&has_upper_latin1_only_utf8_matches);
- SvREFCNT_dec_NN(nonascii_but_latin1_properties);
- }
- else {
- has_upper_latin1_only_utf8_matches
- = nonascii_but_latin1_properties;
+
+ /* Remove them from what now becomes the unconditional list */
+ _invlist_subtract(posixes, nonascii_but_latin1_properties,
+ &posixes);
+
+ /* And add those unconditional ones to the final list */
+ if (cp_list) {
+ _invlist_union(cp_list, posixes, &cp_list);
+ SvREFCNT_dec_NN(posixes);
+ posixes = NULL;
+ }
+ else {
+ cp_list = posixes;
+ }
+
+ SvREFCNT_dec(nonascii_but_latin1_properties);
+
+ /* Get rid of any characters that we now know are matched
+ * unconditionally from the conditional list, which may make
+ * that list empty */
+ _invlist_subtract(has_upper_latin1_only_utf8_matches,
+ cp_list,
+ &has_upper_latin1_only_utf8_matches);
+ if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
+ SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
+ has_upper_latin1_only_utf8_matches = NULL;
+ }
}
}
}
invlist_iterfinish(cp_list);
}
}
-
-#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \
- ( DEPENDS_SEMANTICS \
- && (ANYOF_FLAGS(ret) \
- & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
-
- /* See if we can simplify things under /d */
- if ( has_upper_latin1_only_utf8_matches
- || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
+ else if ( DEPENDS_SEMANTICS
+ && ( has_upper_latin1_only_utf8_matches
+ || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
{
- /* But not if we are inverting, as that screws it up */
- if (! invert) {
- if (has_upper_latin1_only_utf8_matches) {
- if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
-
- /* Here, we have both the flag and inversion list. Any
- * character in 'has_upper_latin1_only_utf8_matches'
- * matches when UTF-8 is in effect, but it also matches
- * when UTF-8 is not in effect because of
- * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches
- * unconditionally, so can be added to the regular list,
- * and 'has_upper_latin1_only_utf8_matches' cleared */
- _invlist_union(cp_list,
- has_upper_latin1_only_utf8_matches,
- &cp_list);
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
- }
- else if (cp_list) {
-
- /* Here, 'cp_list' gives chars that always match, and
- * 'has_upper_latin1_only_utf8_matches' gives chars that
- * were specified to match only if the target string is in
- * UTF-8. It may be that these overlap, so we can subtract
- * the unconditionally matching from the conditional ones,
- * to make the conditional list as small as possible,
- * perhaps even clearing it, in which case more
- * optimizations are possible later */
- _invlist_subtract(has_upper_latin1_only_utf8_matches,
- cp_list,
- &has_upper_latin1_only_utf8_matches);
- if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) {
- SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches);
- has_upper_latin1_only_utf8_matches = NULL;
- }
- }
- }
-
- /* Similarly, if the unconditional matches include every upper
- * latin1 character, we can clear that flag to permit later
- * optimizations */
- if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) {
- SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1);
- _invlist_subtract(only_non_utf8_list, cp_list,
- &only_non_utf8_list);
- if (_invlist_len(only_non_utf8_list) == 0) {
- ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
- }
- SvREFCNT_dec_NN(only_non_utf8_list);
- only_non_utf8_list = NULL;;
- }
- }
-
- /* If we haven't gotten rid of all conditional matching, we change the
- * regnode type to indicate that */
- if ( has_upper_latin1_only_utf8_matches
- || MATCHES_ALL_NON_UTF8_NON_ASCII(ret))
- {
- OP(ret) = ANYOFD;
- optimizable = FALSE;
- }
+ OP(ret) = ANYOFD;
+ optimizable = FALSE;
}
-#undef MATCHES_ALL_NON_UTF8_NON_ASCII
+
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
* at compile time. Besides not inverting folded locale now, we can't
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- FALSE /* Don't assume /x */ );
+ FALSE /* Don't force /x */ );
}
}
PERL_ARGS_ASSERT_REGPROP;
- sv_setpvn(sv, "", 0);
+ SvPVCLEAR(sv);
if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
/* It would be nice to FAIL() here, but this may be called from
= (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
- DEBUG_TRIE_COMPILE_r(
+ DEBUG_TRIE_COMPILE_r({
+ if (trie->jump)
+ sv_catpvs(sv, "(JUMP)");
Perl_sv_catpvf(aTHX_ sv,
"<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
(UV)trie->startstate,
(UV)TRIE_CHARCOUNT(trie),
(UV)trie->uniquecharcount
);
- );
+ });
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
(void) put_charclass_bitmap_innards(sv,
: TRIE_BITMAP(trie)),
NULL,
NULL,
- NULL
+ NULL,
+ FALSE
);
sv_catpvs(sv, "]");
}
-
} else if (k == CURLY) {
U32 lo = ARG1(o), hi = ARG2(o);
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
/* And things that aren't in the bitmap, but are small enough to be */
SV* bitmap_range_not_in_bitmap = NULL;
+ const bool inverted = flags & ANYOF_INVERT;
+
if (OP(o) == ANYOFL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
sv_catpvs(sv, "{utf8-locale-reqd}");
ANYOF_BITMAP(o),
bitmap_range_not_in_bitmap,
only_utf8_locale_invlist,
- o);
+ o,
+
+ /* Can't try inverting for a
+ * better display if there are
+ * things that haven't been
+ * resolved */
+ unresolved != NULL);
SvREFCNT_dec(bitmap_range_not_in_bitmap);
/* If there are user-defined properties which haven't been defined yet,
- * output them, in a separate [] from the bitmap range stuff */
+ * output them. If the result is not to be inverted, it is clearest to
+ * output them in a separate [] from the bitmap range stuff. If the
+ * result is to be complemented, we have to show everything in one [],
+ * as the inversion applies to the whole thing. Use {braces} to
+ * separate them from anything in the bitmap and anything above the
+ * bitmap. */
if (unresolved) {
- if (do_sep) {
- Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ if (inverted) {
+ if (! do_sep) { /* If didn't output anything in the bitmap */
+ sv_catpvs(sv, "^");
+ }
+ sv_catpvs(sv, "{");
}
- if (flags & ANYOF_INVERT) {
- sv_catpvs(sv, "^");
+ else if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
}
sv_catsv(sv, unresolved);
- do_sep = TRUE;
- SvREFCNT_dec_NN(unresolved);
+ if (inverted) {
+ sv_catpvs(sv, "}");
+ }
+ do_sep = ! inverted;
}
/* And, finally, add the above-the-bitmap stuff */
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
}
- /* And, for easy of understanding, it is always output not-shown as
- * complemented */
- if (flags & ANYOF_INVERT) {
+ /* And, for easy of understanding, it is shown in the
+ * uncomplemented form if possible. The one exception being if
+ * there are unresolved items, where the inversion has to be
+ * delayed until runtime */
+ if (inverted && ! unresolved) {
_invlist_invert(nonbitmap_invlist);
_invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist);
}
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(unresolved);
}
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
* mnemonic names. Split off any of those at the beginning and end of
* the range to print mnemonically. It isn't possible for many of
* these to be in a row, so this won't overwhelm with output */
- while (isMNEMONIC_CNTRL(start) && start <= end) {
- put_code_point(sv, start);
- start++;
- }
- if (start < end && isMNEMONIC_CNTRL(end)) {
-
- /* Here, the final character in the range has a mnemonic name.
- * Work backwards from the end to find the final non-mnemonic */
- UV temp_end = end - 1;
- while (isMNEMONIC_CNTRL(temp_end)) {
- temp_end--;
+ if ( start <= end
+ && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end)))
+ {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_code_point(sv, start);
+ start++;
}
- /* And separately output the interior range that doesn't start or
- * end with mnemonics */
- put_range(sv, start, temp_end, FALSE);
+ /* If this didn't take care of the whole range ... */
+ if (start <= end) {
- /* Then output the mnemonic trailing controls */
- start = temp_end + 1;
- while (start <= end) {
- put_code_point(sv, start);
- start++;
+ /* Look backwards from the end to find the final non-mnemonic
+ * */
+ UV temp_end = end;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the interior range that doesn't start
+ * or end with mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_code_point(sv, start);
+ start++;
+ }
+ break;
}
- break;
}
/* As a final resort, output the range or subrange as hex. */
)
{
/* Create and return an SV containing a displayable version of the bitmap
- * and associated information determined by the input parameters. */
+ * and associated information determined by the input parameters. If the
+ * output would have been only the inversion indicator '^', NULL is instead
+ * returned. */
SV * output;
}
}
- /* If the only thing we output is the '^', clear it */
if (invert && SvCUR(output) == 1) {
- SvCUR_set(output, 0);
+ return NULL;
}
return output;
char *bitmap,
SV *nonbitmap_invlist,
SV *only_utf8_locale_invlist,
- const regnode * const node)
+ const regnode * const node,
+ const bool force_as_is_display)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class defined by the other arguments:
* 'node' is the regex pattern node. It is needed only when the above two
* parameters are not null, and is passed so that this routine can
* tease apart the various reasons for them.
+ * 'force_as_is_display' is TRUE if this routine should definitely NOT try
+ * to invert things to see if that leads to a cleaner display. If
+ * FALSE, this routine is free to use its judgment about doing this.
*
* It returns TRUE if there was actually something output. (It may be that
* the bitmap, etc is empty.)
*
* When called for outputting the bitmap of a non-ANYOF node, just pass the
- * bitmap, with the succeeding parameters set to NULL.
- *
+ * bitmap, with the succeeding parameters set to NULL, and the final one to
+ * FALSE.
*/
/* In general, it tries to display the 'cleanest' representation of the
* whether the class itself is to be inverted. However, there are some
* cases where it can't try inverting, as what actually matches isn't known
* until runtime, and hence the inversion isn't either. */
- bool inverting_allowed = TRUE;
+ bool inverting_allowed = ! force_as_is_display;
int i;
STRLEN orig_sv_cur = SvCUR(sv);
is UTF-8 */
SV* as_is_display; /* The output string when we take the inputs
- literally */
+ literally */
SV* inverted_display; /* The output string when we invert the inputs */
U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
/* If have to take the output as-is, just do that */
if (! inverting_allowed) {
- sv_catsv(sv, as_is_display);
+ if (as_is_display) {
+ sv_catsv(sv, as_is_display);
+ SvREFCNT_dec_NN(as_is_display);
+ }
}
else { /* But otherwise, create the output again on the inverted input, and
use whichever version is shorter */
_invlist_invert(only_utf8);
_invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8);
}
+ else if (not_utf8) {
- if (not_utf8) {
- _invlist_invert(not_utf8);
- _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8);
+ /* If a code point matches iff the target string is not in UTF-8,
+ * then complementing the result has it not match iff not in UTF-8,
+ * which is the same thing as matching iff it is UTF-8. */
+ only_utf8 = not_utf8;
+ not_utf8 = NULL;
}
if (only_utf8_locale) {
/* Use the shortest representation, taking into account our bias
* against showing it inverted */
- if (SvCUR(inverted_display) + inverted_bias
- < SvCUR(as_is_display) + as_is_bias)
+ if ( inverted_display
+ && ( ! as_is_display
+ || ( SvCUR(inverted_display) + inverted_bias
+ < SvCUR(as_is_display) + as_is_bias)))
{
sv_catsv(sv, inverted_display);
}
- else {
+ else if (as_is_display) {
sv_catsv(sv, as_is_display);
}
- SvREFCNT_dec_NN(as_is_display);
- SvREFCNT_dec_NN(inverted_display);
+ SvREFCNT_dec(as_is_display);
+ SvREFCNT_dec(inverted_display);
}
SvREFCNT_dec_NN(invlist);
#endif
const regnode *nextbranch= NULL;
I32 word_idx;
- sv_setpvs(sv, "");
+ SvPVCLEAR(sv);
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);