#endif
#include "dquote_static.c"
+#ifndef PERL_IN_XSUB_RE
+# include "charclass_invlists.h"
+#endif
#ifdef op
#undef op
* The adjacent nodes actually may be separated by NOTHING kind nodes, and
* these get optimized out
*
- * If there are problematic code sequences, *min_change is set to the delta
- * that the minimum size of the node can be off from its actual size. And, the
- * node type of the result is changed to reflect that it contains these
+ * If there are problematic code sequences, *min_subtract is set to the delta
+ * that the minimum size of the node can be less than its actual size. And,
+ * the node type of the result is changed to reflect that it contains these
* sequences.
*
- * And *has_exactf_sharp_s is set to indicate if the node is EXACTF and
- * contains LATIN SMALL LETTER SHARP S
+ * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
+ * and contains LATIN SMALL LETTER SHARP S
*
* This is as good a place as any to discuss the design of handling these
* problematic sequences. It's been wrong in Perl for a very long time. There
* those have already been dealt with. These would otherwise be the most
* likely candidates for generating further tricky sequences. In other words,
* Unicode by itself is unlikely to add new ones unless it is for compatibility
- * with pre-existing standards.
+ * with pre-existing standards, and there aren't many of those left.
*
* The previous designs for dealing with these involved assigning a special
* node for them. This approach doesn't work, as evidenced by this example:
- * "\xDFs" =~ /s\xDF/ui
+ * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
* Both these fold to "sss", but if the pattern is parsed to create a node of
* that would match just the \xDF, it won't be able to handle the case where a
* successful match would have to cross the node's boundary. The new approach
* There are a number of components to the approach (a lot of work for just
* three code points!):
* 1) This routine examines each EXACTFish node that could contain the
- * problematic sequences, and if found, returns in *min_change the total
- * delta between the actual length of the string and one that could match
- * it. This delta is used by the caller to adjust the min length of the
- * match, and the delta between min and max, so that the optimizer doesn't
- * reject these possibilities based on size constraints
+ * problematic sequences. It returns in *min_subtract how much to
+ * subtract from the the actual length of the string to get a real minimum
+ * for one that could match it. This number is usually 0 except for the
+ * problematic sequences. This delta is used by the caller to adjust the
+ * min length of the match, and the delta between min and max, so that the
+ * optimizer doesn't reject these possibilities based on size constraints.
* 2) These sequences are not currently correctly handled by the trie code
* either, so it changes the joined node type to ops that are not handled
* by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
* is pre-folded by regcomp.c. This saves effort in regex matching.
* However, probably mostly for historical reasons, the pre-folding isn't
- * done for non-UTF8 patterns. The fold possibilities for these are quite
- * simple, except for the sharp s. All the ones that don't involve a
- * UTF-8 target string are members of a fold-pair, and arrays are set up
- * for all of them that quickly find the other member of the pair. It
- * might actually be faster to pre-fold these, but it isn't currently
- * done, except for the sharp s. Code elsewhere in this file makes sure
- * that it gets folded to 'ss', even if the pattern isn't UTF-8. This
- * avoids the issues describe in the next item.
+ * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
+ * nodes, as what they fold to isn't known until runtime.) The fold
+ * possibilities for the non-UTF8 patterns are quite simple, except for
+ * the sharp s. All the ones that don't involve a UTF-8 target string
+ * are members of a fold-pair, and arrays are set up for all of them
+ * that quickly find the other member of the pair. It might actually
+ * be faster to pre-fold these, but it isn't currently done, except for
+ * the sharp s. Code elsewhere in this file makes sure that it gets
+ * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
+ * issues described in the next item.
* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
* 'ss' or not is not knowable at compile time. It will match iff the
* target string is in UTF-8, unlike the EXACTFU nodes, where it always
* cases are either 1-1 folds when no UTF-8 is involved; or is true by
* virtue of having this file pre-fold UTF-8 patterns. I'm
* reluctant to try to change this assumption, so instead the code punts.
- * This routine examines EXACTF nodes for the sharp s, and returns whether
- * the node is an EXACTF node that contains one or not. When it is true,
- * the caller sets a flag that later causes the optimizer in this file to
- * not set values for the floating and fixed string lengths, and thus
- * avoid the optimizer code in regexec.c that makes this invalid
- * assumption. Thus, there is no optimization based on string lengths for
- * EXACTF nodes that contain the sharp s. This only happens for /id rules
- * (which means the pattern isn't in UTF-8).
+ * This routine examines EXACTF nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node is an EXACTF node that
+ * contains a sharp s. When it is true, the caller sets a flag that later
+ * causes the optimizer in this file to not set values for the floating
+ * and fixed string lengths, and thus avoids the optimizer code in
+ * regexec.c that makes the invalid assumption. Thus, there is no
+ * optimization based on string lengths for EXACTF nodes that contain the
+ * sharp s. This only happens for /id rules (which means the pattern
+ * isn't in UTF-8).
*/
-#define JOIN_EXACT(scan,min_change,has_exactf_sharp_s, flags) \
+#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min_change),has_exactf_sharp_s, (flags),NULL,depth+1)
+ join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
U32 stringok = 1;
}
#endif
}
-#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
-#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
-#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
-#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
- *min_change = 0;
+ *min_subtract = 0;
*has_exactf_sharp_s = FALSE;
/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
* can now analyze for sequences of problematic code points. (Prior to
* this final joining, sequences could have been split over boundaries, and
- * hence missed). The sequences only happen in folding */
+ * hence missed). The sequences only happen in folding, hence for any
+ * non-EXACT EXACTish node */
if (OP(scan) != EXACT) {
U8 *s;
U8 * s0 = (U8*) STRING(scan);
* This uses an exclusive 'or' to find that bit and then inverts it to
* form a mask, with just a single 0, in the bit position where 'S' and
* 's' differ. */
- const U8 S_or_s_mask = ~ ('S' ^ 's');
+ const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
const U8 s_masked = 's' & S_or_s_mask;
/* One pass is made over the node's string looking for all the
case 's':
case 'S':
+ /* Current character is an 's' or 'S'. If next one is
+ * as well, we have the dreaded sequence */
if (((*(s+1) & S_or_s_mask) == s_masked)
/* These two node types don't have special handling
* for 'ss' */
&& OP(scan) != EXACTFL && OP(scan) != EXACTFA)
{
- *min_change -= 1;
+ *min_subtract += 1;
OP(scan) = EXACTFU_SS;
s++; /* No need to look at this character again */
}
break;
}
greek_sequence:
- *min_change -= 4;
+ *min_subtract += 4;
/* This can't currently be handled by trie's, so change
* the node type to indicate this. If EXACTFA and
if (s_end - s > 1
&& ((*(s+1) & S_or_s_mask) == s_masked))
{
- *min_change -= 1;
+ *min_subtract += 1;
/* EXACTF nodes need to know that the minimum
* length changed so that a sharp s in the string
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
- IV min_change = 0;
+ UV min_subtract = 0; /* How much to subtract from the minimum node
+ length to get a real minimum (because the
+ folded version may be shorter) */
bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
DEBUG_PEEP("Peep",scan,depth);
- JOIN_EXACT(scan,&min_change, &has_exactf_sharp_s, 0);
+
+ /* Its not clear to khw or hv why this is done here, and not in the
+ * clauses that deal with EXACT nodes. khw's guess is that it's
+ * because of a previous design */
+ JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
#define TRIE_TYPE_IS_SAFE 1
+Note that join_exact() assumes that the other types of EXACTFish nodes are not
+used in tries, so that would have to be updated if this changed
+
*/
#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
else if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
- min += l + min_change;
+ min += l - min_subtract;
if (min < 0) {
min = 0;
}
- delta += abs(min_change);
+ delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
- data->pos_min += l + min_change;
+ data->pos_min += l - min_subtract;
if (data->pos_min < 0) {
data->pos_min = 0;
}
- data->pos_delta += abs(min_change);
- if (min_change) {
+ data->pos_delta += min_subtract;
+ if (min_subtract) {
data->longest = &(data->longest_float);
}
}
/* Also set the other member of the fold pair. In case
* that unicode semantics is called for at runtime, use
* the full latin1 fold. (Can't do this for locale,
- * because not known until runtime */
+ * because not known until runtime) */
ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
- /* All folds except under /iaa that include s, S, and
- * sharp_s also may include the others */
+ /* All other (EXACTFL handled above) folds except under
+ * /iaa that include s, S, and sharp_s also may include
+ * the others */
if (OP(scan) != EXACTFA) {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifndef PERL_IN_XSUB_RE
+ /* Initialize these here instead of as-needed, as is quick and avoids
+ * having to test them each time otherwise */
+ if (! PL_AboveLatin1) {
+ PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
+ PL_ASCII = _new_invlist_C_array(ASCII_invlist);
+ PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
+
+ PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+ PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+ PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
+ PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
+
+ PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
+ PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
+
+ PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
+
+ PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
+ PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
+
+ PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
+
+ PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
+ PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
+
+ PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+ PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+ PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
+ PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
+
+ PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
+ PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
+
+ PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
+ PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
+
+ PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
+ PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
+
+ PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
+ PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
+
+ PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
+ PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
+
+ PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
+
+ PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
+ PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
+
+ PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
+ PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
+ }
+#endif
+
exp = SvPV(pattern, plen);
if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
{
I32 t,ml;
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
|| (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
&& data.offset_fixed == data.offset_float_min
Be careful.
*/
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
+
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
&& (longest_fixed_length
|| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
-#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */
+/* This is a combination of a version and data structure type, so that one
+ * being passed in can be validated to be an inversion list of the correct
+ * vintage. When the structure of the header is changed, a new random number
+ * in the range 2**31-1 should be generated and the new() method changed to
+ * insert that at this location. Then, if an auxiliary program doesn't change
+ * correspondingly, it will be discovered immediately */
+#define INVLIST_VERSION_ID_OFFSET 2
+#define INVLIST_VERSION_ID 1064334010
+
+/* For safety, when adding new elements, remember to #undef them at the end of
+ * the inversion list code section */
+
+#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
* contains the code point U+00000, and begins here. If 1, the inversion list
* doesn't contain U+0000, and it begins at the next UV in the array.
* properly */
*get_invlist_zero_addr(new_list) = UV_MAX;
+ *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
+#if HEADER_LENGTH != 4
+# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
+#endif
+
return new_list;
}
#endif
+STATIC SV*
+S__new_invlist_C_array(pTHX_ UV* list)
+{
+ /* Return a pointer to a newly constructed inversion list, initialized to
+ * point to <list>, which has to be in the exact correct inversion list
+ * form, including internal fields. Thus this is a dangerous routine that
+ * should not be used in the wrong hands */
+
+ SV* invlist = newSV_type(SVt_PV);
+
+ PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
+
+ SvPV_set(invlist, (char *) list);
+ SvLEN_set(invlist, 0); /* Means we own the contents, and the system
+ shouldn't touch it */
+ SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+
+ if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
+ Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+ }
+
+ return invlist;
+}
+
STATIC void
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
{
#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
+#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
+
#ifndef PERL_IN_XSUB_RE
void
Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
return;
}
+
void
-Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
{
/* Take the union of two inversion lists and point <output> to it. *output
* should be defined upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented.
+ * the reference count to that list will be decremented. The first list,
+ * <a>, may be NULL, in which case a copy of the second list is returned.
+ * If <complement_b> is TRUE, the union is taken of the complement
+ * (inversion) of <b> instead of b itself.
+ *
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
*/
UV count = 0;
- PERL_ARGS_ASSERT__INVLIST_UNION;
+ PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
/* If either one is empty, the union is the other one */
- len_a = invlist_len(a);
- if (len_a == 0) {
+ if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
if (*output == a) {
- SvREFCNT_dec(a);
+ if (a != NULL) {
+ SvREFCNT_dec(a);
+ }
}
if (*output != b) {
*output = invlist_clone(b);
+ if (complement_b) {
+ _invlist_invert(*output);
+ }
} /* else *output already = b; */
return;
}
if (*output == b) {
SvREFCNT_dec(b);
}
- if (*output != a) {
- *output = invlist_clone(a);
- }
- /* else *output already = a; */
+
+ /* The complement of an empty list is a list that has everything in it,
+ * so the union with <a> includes everything too */
+ if (complement_b) {
+ if (a == *output) {
+ SvREFCNT_dec(a);
+ }
+ *output = _new_invlist(1);
+ _append_range_to_invlist(*output, 0, UV_MAX);
+ }
+ else if (*output != a) {
+ *output = invlist_clone(a);
+ }
+ /* else *output already = a; */
return;
}
array_a = invlist_array(a);
array_b = invlist_array(b);
+ /* If are to take the union of 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* To complement, we invert: if the first element is 0, remove it. To
+ * do this, we just pretend the array starts one later, and clear the
+ * flag as we don't have to do anything else later */
+ if (array_b[0] == 0) {
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
/* Size the union for the worst case: that the sets are completely
* disjoint */
u = _new_invlist(len_a + len_b);
SvREFCNT_dec(*output);
}
+ /* If we've changed b, restore it */
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+
*output = u;
return;
}
void
-Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
{
/* Take the intersection of two inversion lists and point <i> to it. *i
* should be defined upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented.
+ * If <complement_b> is TRUE, the result will be the intersection of <a>
+ * and the complement (or inversion) of <b> instead of <b> directly.
+ *
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
*/
UV count = 0;
- PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
+ PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- /* If either one is empty, the intersection is null */
+ /* Special case if either one is empty */
len_a = invlist_len(a);
if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
- /* If the result is the same as one of the inputs, the input is being
- * overwritten */
+ if (len_a != 0 && complement_b) {
+
+ /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
+ * be empty. Here, also we are using 'b's complement, which hence
+ * must be every possible code point. Thus the intersection is
+ * simply 'a'. */
+ if (*i != a) {
+ *i = invlist_clone(a);
+
+ if (*i == b) {
+ SvREFCNT_dec(b);
+ }
+ }
+ /* else *i is already 'a' */
+ return;
+ }
+
+ /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
+ * intersection must be empty */
if (*i == a) {
SvREFCNT_dec(a);
}
else if (*i == b) {
SvREFCNT_dec(b);
}
-
*i = _new_invlist(0);
return;
}
array_a = invlist_array(a);
array_b = invlist_array(b);
+ /* If are to take the intersection of 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* To complement, we invert: if the first element is 0, remove it. To
+ * do this, we just pretend the array starts one later, and clear the
+ * flag as we don't have to do anything else later */
+ if (array_b[0] == 0) {
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
/* Size the intersection for the worst case: that the intersection ends up
* fragmenting everything to be completely disjoint */
r= _new_invlist(len_a + len_b);
SvREFCNT_dec(*i);
}
+ /* If we've changed b, restore it */
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+
*i = r;
return;
}
return new_invlist;
}
-#ifndef PERL_IN_XSUB_RE
-void
-Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
-{
- /* Point <result> to an inversion list which consists of all elements in
- * <a> that aren't also in <b>. *result should be defined upon input, and
- * if it points to C<b> its reference count will be decremented. */
-
- PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
- assert(a != b);
-
- /* Subtracting nothing retains the original */
- if (invlist_len(b) == 0) {
-
- if (*result == b) {
- SvREFCNT_dec(b);
- }
-
- /* If the result is not to be the same variable as the original, create
- * a copy */
- if (*result != a) {
- *result = invlist_clone(a);
- }
- } else {
- SV *b_copy = invlist_clone(b);
- _invlist_invert(b_copy); /* Everything not in 'b' */
-
- if (*result == b) {
- SvREFCNT_dec(b);
- }
-
- _invlist_intersection(a, b_copy, result); /* Everything in 'a' not in
- 'b' */
- SvREFCNT_dec(b_copy);
- }
-
- return;
-}
-#endif
-
PERL_STATIC_INLINE UV*
S_get_invlist_iter_addr(pTHX_ SV* invlist)
{
return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
}
+PERL_STATIC_INLINE UV*
+S_get_invlist_version_id_addr(pTHX_ SV* invlist)
+{
+ /* Return the address of the UV that contains the version id. */
+
+ PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
+
+ return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+}
+
PERL_STATIC_INLINE void
S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
{
#undef INVLIST_LEN_OFFSET
#undef INVLIST_ZERO_OFFSET
#undef INVLIST_ITER_OFFSET
+#undef INVLIST_VERSION_ID
/* End of inversion list object */
char *s;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- regnode * orig_emit;
U8 node_type;
/* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
bool is_exactfu_sharp_s;
ender = 0;
- orig_emit = RExC_emit; /* Save the original output node position in
- case we need to output a different node
- type */
node_type = ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
/* Prime the casefolded buffer. Locale rules, which apply
* only to code points < 256, aren't known until execution,
* so for them, just output the original character using
- * utf8 */
+ * utf8. If we start to fold non-UTF patterns, be sure to
+ * update join_exact() */
if (LOC && ender < 256) {
if (UNI_IS_INVARIANT(ender)) {
*tmpbuf = (U8) ender;
}
}
-/* No locale test, and always Unicode semantics, no ignore-case differences */
-#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
-ANYOF_##NAME: \
- for (value = 0; value < 256; value++) \
- if (TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- yesno = '+'; \
- what = WORD; \
- break; \
-case ANYOF_N##NAME: \
- for (value = 0; value < 256; value++) \
- if (!TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- yesno = '!'; \
- what = WORD; \
- break
-
-/* Like the above, but there are differences if we are in uni-8-bit or not, so
- * there are two tests passed in, to use depending on that. There aren't any
- * cases where the label is different from the name, so no need for that
- * parameter.
- * Sets 'what' to WORD which is the property name for non-bitmap code points;
- * But, uses FOLD_WORD instead if /i has been selected, to allow a different
- * property name */
-#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \
-ANYOF_##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- else { \
- for (value = 0; value < 128; value++) { \
- if (TEST_7(UNI_TO_NATIVE(value))) stored += \
- set_regclass_bit(pRExC_state, ret, \
- (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- yesno = '+'; \
- if (FOLD) { \
- what = FOLD_WORD; \
- } \
- else { \
- what = WORD; \
- } \
- break; \
-case ANYOF_N##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (! TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- else { \
- for (value = 0; value < 128; value++) { \
- if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- if (AT_LEAST_ASCII_RESTRICTED) { \
- for (value = 128; value < 256; value++) { \
- stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
- } \
- else { \
- /* For a non-ut8 target string with DEPENDS semantics, all above \
- * ASCII Latin1 code points match the complement of any of the \
- * classes. But in utf8, they have their Unicode semantics, so \
- * can't just set them in the bitmap, or else regexec.c will think \
- * they matched when they shouldn't. */ \
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
- } \
- } \
- yesno = '!'; \
- if (FOLD) { \
- what = FOLD_WORD; \
- } \
- else { \
- what = WORD; \
- } \
- break
+/* Generate the code to add a full posix character <class> to the bracketed
+ * character class given by <node>. (<node> is needed only under locale rules)
+ * destlist is the inversion list for non-locale rules that this class is
+ * to be added to
+ * sourcelist is the ASCII-range inversion list to add under /a rules
+ * Xsourcelist is the full Unicode range list to use otherwise. */
+#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
+ if (LOC) { \
+ SV* scratch_list = NULL; \
+ \
+ /* Set this class in the node for runtime matching */ \
+ ANYOF_CLASS_SET(node, class); \
+ \
+ /* For above Latin1 code points, we use the full Unicode range */ \
+ _invlist_intersection(PL_AboveLatin1, \
+ Xsourcelist, \
+ &scratch_list); \
+ /* And set the output to it, adding instead if there already is an \
+ * output. Checking if <destlist> is NULL first saves an extra \
+ * clone. Its reference count will be decremented at the next \
+ * union, etc, or if this is the only instance, at the end of the \
+ * routine */ \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ } \
+ else { \
+ /* For non-locale, just add it to any existing list */ \
+ _invlist_union(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : Xsourcelist, \
+ &destlist); \
+ }
+
+/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
+ */
+#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
+ if (LOC) { \
+ SV* scratch_list = NULL; \
+ ANYOF_CLASS_SET(node, class); \
+ _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ } \
+ else { \
+ _invlist_union_complement_2nd(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : Xsourcelist, \
+ &destlist); \
+ /* Under /d, everything in the upper half of the Latin1 range \
+ * matches this complement */ \
+ if (DEPENDS_SEMANTICS) { \
+ ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
+ } \
+ }
+
+/* Generate the code to add a posix character <class> to the bracketed
+ * character class given by <node>. (<node> is needed only under locale rules)
+ * destlist is the inversion list for non-locale rules that this class is
+ * to be added to
+ * sourcelist is the ASCII-range inversion list to add under /a rules
+ * l1_sourcelist is the Latin1 range list to use otherwise.
+ * Xpropertyname is the name to add to <run_time_list> of the property to
+ * specify the code points above Latin1 that will have to be
+ * determined at run-time
+ * run_time_list is a SV* that contains text names of properties that are to
+ * be computed at run time. This concatenates <Xpropertyname>
+ * to it, apppropriately
+ * This is essentially DO_POSIX, but we know only the Latin1 values at compile
+ * time */
+#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
+ l1_sourcelist, Xpropertyname, run_time_list) \
+ /* If not /a matching, there are going to be code points we will have \
+ * to defer to runtime to look-up */ \
+ if (! AT_LEAST_ASCII_RESTRICTED) { \
+ Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
+ } \
+ if (LOC) { \
+ ANYOF_CLASS_SET(node, class); \
+ } \
+ else { \
+ _invlist_union(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : l1_sourcelist, \
+ &destlist); \
+ }
+
+/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
+ * this and DO_N_POSIX */
+#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
+ l1_sourcelist, Xpropertyname, run_time_list) \
+ if (AT_LEAST_ASCII_RESTRICTED) { \
+ _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
+ } \
+ else { \
+ Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+ if (LOC) { \
+ ANYOF_CLASS_SET(node, namedclass); \
+ } \
+ else { \
+ SV* scratch_list = NULL; \
+ _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ if (DEPENDS_SEMANTICS) { \
+ ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
+ } \
+ } \
+ }
STATIC U8
S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
/* Invert if asking for the complement */
if (value == 'P') {
-
- /* Add to any existing list */
- if (! properties) {
- properties = invlist_clone(invlist);
- _invlist_invert(properties);
- }
- else {
- invlist = invlist_clone(invlist);
- _invlist_invert(invlist);
- _invlist_union(properties, invlist, &properties);
- SvREFCNT_dec(invlist);
- }
+ _invlist_union_complement_2nd(properties, invlist, &properties);
/* The swash can't be used as-is, because we've
* inverted things; delay removing it to here after
swash = NULL;
}
else {
- if (! properties) {
- properties = invlist_clone(invlist);
- }
- else {
- _invlist_union(properties, invlist, &properties);
- }
+ _invlist_union(properties, invlist, &properties);
}
}
Safefree(name);
}
if (!SIZE_ONLY) {
- const char *what = NULL;
- char yesno = 0;
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
-
- case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
- case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
- case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
- case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
- case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
- case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
- case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
- case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
- case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
- case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
- /* \s, \w match all unicode if utf8. */
- case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
- case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
- case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
- case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
- case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
+
+ case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_NALNUMC:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_ALPHA:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
+ case ANYOF_NALPHA:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
case ANYOF_ASCII:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ASCII);
- else {
- for (value = 0; value < 128; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
}
- yesno = '+';
- what = NULL; /* Doesn't match outside ascii, so
- don't want to add +utf8:: */
+ else {
+ _invlist_union(properties, PL_ASCII, &properties);
+ }
break;
case ANYOF_NASCII:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NASCII);
- else {
- for (value = 128; value < 256; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
}
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
- yesno = '!';
- what = "ASCII";
- break;
+ else {
+ _invlist_union_complement_2nd(properties,
+ PL_ASCII, &properties);
+ if (DEPENDS_SEMANTICS) {
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+ }
+ }
+ break;
+ case ANYOF_BLANK:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_NBLANK:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_CNTRL:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
+ case ANYOF_NCNTRL:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_NDIGIT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_GRAPH:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_NGRAPH:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_HORIZWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding. It turns out that \h
+ * is just a synonym for XPosixBlank */
+ _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_NHORIZWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_LOWER:
+ case ANYOF_NLOWER:
+ { /* These require special handling, as they differ under
+ folding, matching Cased there (which in the ASCII range
+ is the same as Alpha */
+
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
else {
- /* consecutive digits assumed */
- for (value = '0'; value <= '9'; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+ ascii_source = PL_PosixLower;
+ l1_source = PL_L1PosixLower;
+ Xname = "XPosixLower";
+ }
+ if (namedclass == ANYOF_LOWER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
+ }
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
}
- yesno = '+';
- what = "Digit";
break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ }
+ case ANYOF_PRINT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_NPRINT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_PUNCT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_NPUNCT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_PSXSPC:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_NPSXSPC:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_SPACE:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_NSPACE:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_UPPER: /* Same as LOWER, above */
+ case ANYOF_NUPPER:
+ {
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
else {
- /* consecutive digits assumed */
- for (value = 0; value < '0'; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
- for (value = '9' + 1; value < 256; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+ ascii_source = PL_PosixUpper;
+ l1_source = PL_L1PosixUpper;
+ Xname = "XPosixUpper";
}
- yesno = '!';
- what = "Digit";
- if (AT_LEAST_ASCII_RESTRICTED ) {
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ if (namedclass == ANYOF_UPPER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
}
- break;
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
+ }
+ break;
+ }
+ case ANYOF_ALNUM: /* Really is 'Word' */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_NALNUM:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_VERTWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding */
+ _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_NVERTWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_XDIGIT:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
+ case ANYOF_NXDIGIT:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
case ANYOF_MAX:
/* this is to handle \p and \P */
break;
vFAIL("Invalid [::] class");
break;
}
- if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
- /* Strings such as "+utf8::isWord\n" */
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
- }
continue;
}
}
}
- /* Done with loop; set <nonbitmap> to not include any code points that
- * are in the bitmap */
+ /* Done with loop; remove any code points that are in the bitmap from
+ * <nonbitmap> */
if (change_invlist) {
- SV* keep_list = _new_invlist(2);
- _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
- _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
- SvREFCNT_dec(keep_list);
+ _invlist_subtract(nonbitmap,
+ (DEPENDS_SEMANTICS)
+ ? PL_ASCII
+ : PL_Latin1,
+ &nonbitmap);
}
/* If have completely emptied it, remove it completely */
* there should not be overlap unless is /d rules. */
_invlist_invert(nonbitmap);
+ /* Any swash can't be used as-is, because we've inverted things */
+ if (swash) {
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
+
for (i = 0; i < 256; ++i) {
if (ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_CLEAR(ret, i);
else {
/* There is no overlap for non-/d, so just delete anything
* below 256 */
- SV* keep_list = _new_invlist(2);
- _append_range_to_invlist(keep_list, 256, UV_MAX);
- _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
- SvREFCNT_dec(keep_list);
+ _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
}
}
}
return ret;
}
-#undef _C_C_T_
/* reg_skipcomment()