/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
* a flag that indicates we need to override /d with /u as a result of
* something in the pattern. It should only be used in regards to calling
- * set_regex_charset() or get_regex_charse() */
+ * set_regex_charset() or get_regex_charset() */
#define REQUIRE_UNI_RULES(flagp, restart_retval) \
STMT_START { \
if (DEPENDS_SEMANTICS) { \
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
- const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
+ const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
? 0
: ANYOF_FLAGS(node);
}
/* Add in the points from the bit map */
- if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+ if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
unsigned int start = i++;
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
- U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
+ U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
? 0
: ANYOF_FLAGS(and_with);
U8 anded_flags;
SV* ored_cp_list;
U8 ored_flags;
- U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
+ U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
? 0
: ANYOF_FLAGS(or_with);
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
+ SvREFCNT_dec(invlist);
/* Make sure is clone-safe */
ssc->invlist = NULL;
case ANYOFH:
case ANYOFHb:
case ANYOFHr:
+ case ANYOFHs:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
break;
}
+ case ANYOFR:
+ case ANYOFRb:
+ {
+ SV* cp_list = NULL;
+
+ cp_list = _add_range_to_invlist(cp_list,
+ ANYOFRbase(scan),
+ ANYOFRbase(scan) + ANYOFRdelta(scan));
+
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class, cp_list, invert);
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class, cp_list, invert);
+ }
+
+ SvREFCNT_dec_NN(cp_list);
+ break;
+ }
+
case NPOSIXL:
invert = 1;
/* FALLTHROUGH */
#endif
+PERL_STATIC_INLINE UV
+S_invlist_lowest(SV* const invlist)
+{
+    /* Returns the lowest code point that matches an inversion list.  This API
+     * has an ambiguity, as it returns 0 either when the lowest code point is
+     * actually 0, or when the list is empty.  If this distinction matters to
+     * you, check for emptiness before calling this function */
+
+    UV len;
+    UV *array;
+
+    /* Check the argument before it is handed to any helper, so that a bad
+     * 'invlist' is reported here rather than inside a callee */
+    PERL_ARGS_ASSERT_INVLIST_LOWEST;
+
+    len = _invlist_len(invlist);
+    if (len == 0) {
+        return 0;
+    }
+
+    /* The first element of the list's array is its lowest matching code
+     * point */
+    array = invlist_array(invlist);
+
+    return array[0];
+}
+
STATIC SV *
S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
{
STRLEN len = 0;
UV ender = 0;
char *p;
- char *s;
+ char *s, *old_s = NULL, *old_old_s = NULL;
char *s0;
U32 max_string_len = 255;
U8 node_type = EXACT;
/* Assume the node will be fully used; the excess is given back at
- * the end. Under /i, leave enough extra room so that we won't
- * overflow the buffer when we fold a character which would end up
- * overflowing the node. We can't make any other length
- * assumptions, as a byte input sequence could shrink down. */
+ * the end. Under /i, we may need to temporarily add the fold of
+ * an extra character or two at the end to check for splitting
+ * multi-char folds, so allocate extra space for that. We can't
+ * make any other length assumptions, as a byte input sequence
+ * could shrink down. */
Ptrdiff_t current_string_nodes = STR_SZ(max_string_len
+ ((! FOLD)
? 0
- : 1 * ((UTF)
+ : 2 * ((UTF)
? UTF8_MAXBYTES_CASE
/* Max non-UTF-8 expansion is 2 */ : 2)));
bool next_is_quantifier;
char * oldp = NULL;
- char * old_oldp = NULL;
/* We can convert EXACTF nodes to EXACTFU if they contain only
* characters that match identically regardless of the target
p = RExC_parse;
len = 0;
s = s0;
+ node_type = EXACT;
+ oldp = NULL;
+ maybe_exactfu = FOLD && (DEPENDS_SEMANTICS || LOC);
+ maybe_SIMPLE = SIMPLE;
+ requires_utf8_target = FALSE;
+ has_ss = FALSE;
+ has_micro_sign = FALSE;
continue_parse:
* The exceptions override this */
Size_t added_len = 1;
- old_oldp = oldp;
oldp = p;
+ old_old_s = old_s;
+ old_s = s;
/* White space has already been ignored */
assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
}
}
}
- else {
-
- /* Here is non-UTF8. First, see if the character's
- * fold differs between /d and /u. */
- if (PL_fold[ender] != PL_fold_latin1[ender]) {
- maybe_exactfu = FALSE;
+ else { /* Here is non-UTF8. */
+
+ /* The fold will be one or (rarely) two characters.
+ * Check that there's room for at least a single one
+ * before setting any flags, etc. Because otherwise an
+ * overflowing character could cause a flag to be set
+ * even though it doesn't end up in this node. (For
+ * the two character fold, we check again, before
+ * setting any flags) */
+ if (UNLIKELY(len + 1 > max_string_len)) {
+ overflowed = TRUE;
+ break;
}
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- /* On non-ancient Unicode versions, this includes the
- * multi-char fold SHARP S to 'ss' */
-
- if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
- || ( isALPHA_FOLD_EQ(ender, 's')
- && len > 0
- && isALPHA_FOLD_EQ(*(s-1), 's')))
- {
- /* Here, we have one of the following:
- * a) a SHARP S. This folds to 'ss' only under
- * /u rules. If we are in that situation,
- * fold the SHARP S to 'ss'.
- * b) 'ss'. When under /u, there's nothing
- * special needed to be done here. The
- * previous iteration handled the first 's',
- * and this iteration will handle the second.
- * If, on the otherhand it's not /u, we have
- * to exclude the possibility of moving to /u,
- * so that we won't generate an unwanted
- * match, unless, at runtime, the target
- * string is in UTF-8.
- * */
+ /* On non-ancient Unicodes, check for the only possible
+ * multi-char fold */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ /* This potential multi-char fold means the node
+ * can't be simple (because it could match more
+ * than a single char). And in some cases it will
+ * match 'ss', so set that flag */
+ maybe_SIMPLE = 0;
has_ss = TRUE;
- maybe_exactfu = FALSE; /* Can't generate an
- EXACTFU node (unless we
- already are in one) */
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
- maybe_SIMPLE = 0;
- if (node_type == EXACTFU) {
-
- if (UNLIKELY(len + 2 > max_string_len)) {
- overflowed = TRUE;
- break;
- }
-
- *(s++) = 's';
- /* Let the code below add in the extra 's'
- * */
- ender = 's';
- added_len = 2;
+ /* It can't change to be an EXACTFU (unless already
+ * is one). We fold it iff under /u rules. */
+ if (node_type != EXACTFU) {
+ maybe_exactfu = FALSE;
+ }
+ else {
+ if (UNLIKELY(len + 2 > max_string_len)) {
+ overflowed = TRUE;
+ break;
}
+
+ *(s++) = 's';
+ *(s++) = 's';
+ added_len = 2;
+
+ goto done_with_this_char;
}
}
+ else if ( UNLIKELY(isALPHA_FOLD_EQ(ender, 's'))
+ && LIKELY(len > 0)
+ && UNLIKELY(isALPHA_FOLD_EQ(*(s-1), 's')))
+ {
+ /* Also, the sequence 'ss' is special when not
+ * under /u. If the target string is UTF-8, it
+ * should match SHARP S; otherwise it won't. So,
+ * here we have to exclude the possibility of this
+ * node moving to /u. */
+ has_ss = TRUE;
+ maybe_exactfu = FALSE;
+ }
#endif
+ /* Here, the fold will be a single character */
- else if (UNLIKELY(ender == MICRO_SIGN)) {
+ if (UNLIKELY(ender == MICRO_SIGN)) {
has_micro_sign = TRUE;
}
+ else if (PL_fold[ender] != PL_fold_latin1[ender]) {
- if (UNLIKELY(len + 1 > max_string_len)) {
- overflowed = TRUE;
- break;
+ /* If the character's fold differs between /d and
+ * /u, this can't change to be an EXACTFU node */
+ maybe_exactfu = FALSE;
}
*(s++) = (DEPENDS_SEMANTICS)
}
} /* End of adding current character to the node */
+ done_with_this_char:
+
len += added_len;
if (next_is_quantifier) {
goto continue_parse;
}
- else if (! LOC) { /* XXX shouldn't /l assume could be a UTF-8
- locale, and prepare for that? */
+ else if (FOLD) {
+ bool splittable = FALSE;
+ bool backed_up = FALSE;
+ char * e;
+ char * s_start;
/* Here is /i. Running out of room creates a problem if we are
* folding, and the split happens in the middle of a
* things that fold to them) as 'ff' and 'ss' are
* multi-character folds.
*
+ * The Unicode standard says that multi character folds consist
+ * of either two or three characters. That means we would be
+ * splitting one if the final character in the node is at the
+ * beginning of either type, or is the second of a three
+ * character fold.
+ *
* At this point:
- * old_oldp points to the beginning in the input of the
- * penultimate character in the node.
- * oldp points to the beginning in the input of the
- * final character in the node.
- * p points to the beginning in the input of the
- * next character in the input, the one that won't
- * fit in the node.
+ * ender is the code point of the character that won't fit
+ * in the node
+ * s points to just beyond the final byte in the node.
+ * It's where we would place ender if there were
+ * room, and where in fact we do place ender's fold
+ * in the code below, as we've over-allocated space
+ * for s0 (hence s) to allow for this
+ * e starts at 's' and advances as we append things.
+ * old_s is the same as 's'. (If ender had fit, 's' would
+ * have been advanced to beyond it).
+ * old_old_s points to the beginning byte of the final
+ * character in the node
+ * p points to the beginning byte in the input of the
+ * character beyond 'ender'.
+ * oldp points to the beginning byte in the input of
+ * 'ender'.
*
- * We aren't in the middle of a multi-char fold unless the
- * final character in the node can appear in a non-final
- * position in such a fold. Very few characters actually
- * participate in multi-character folds, and fewer still can be
- * in the non-final position. But it's complicated to know
- * here if that final character is folded or not, so skip this
- * check */
-
- /* Make sure enough space for final char of node,
- * first char of following node, and the fold of the
- * following char (so we don't have to worry about
- * that fold running off the end */
- U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
- STRLEN fold_len;
- UV folded;
- char * const sav_oldp = oldp;
-
- assert(FOLD);
-
- /* The Unicode standard says that multi character folds consist
- * of either two or three characters. So we create a buffer
- * containing a window of three. The first is the final
- * character in the node (folded), and then the two that begin
- * the following node. But if the first character of the
- * following node can't be in a non-final fold position, there
- * is no need to look at its successor character. The macros
- * used below to check for multi character folds require folded
- * inputs, so we have to fold these. (The fold of p was likely
- * calculated in the loop above, but it hasn't beeen saved, and
- * khw thinks it would be too entangled to change to do so) */
-
- if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
- folded = _to_uni_fold_flags(ender,
- foldbuf,
- &fold_len,
- FOLD_FLAGS_FULL);
+ * In the case of /il, we haven't folded anything that could be
+ * affected by the locale. That means only above-Latin1
+ * characters that fold to other above-latin1 characters get
+ * folded at compile time. To check where a good place to
+ * split nodes is, everything in it will have to be folded.
+ * The boolean 'maybe_exactfu' keeps track in /il if there are
+ * any unfolded characters in the node. */
+ bool need_to_fold_loc = LOC && ! maybe_exactfu;
+
+ /* If we do need to fold the node, we need a place to store the
+ * folded copy, and a way to map back to the unfolded original
+ * */
+ char * locfold_buf;
+ Size_t * loc_correspondence;
+
+ if (! need_to_fold_loc) { /* The normal case. Just
+ initialize to the actual node */
+ e = s;
+ s_start = s0;
+ s = old_old_s; /* Point to the beginning of the final char
+ that fits in the node */
}
else {
- foldbuf[0] = folded = MICRO_SIGN;
- fold_len = 1;
- }
-
- /* Here, foldbuf contains the fold of the first character in
- * the next node. We may also need the next one (if there is
- * one) to get our third, but if the first character folded to
- * more than one, those extra one(s) will serve as the third.
- * Also, we don't need a third unless the previous one can
- * appear in a non-final position in a fold */
- if ( ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
- && (fold_len == 1 || ( UTF
- && UVCHR_SKIP(folded) == fold_len))
- && UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
- {
- if (UTF) {
- STRLEN next_fold_len;
- toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
- (U8*) RExC_end, foldbuf + fold_len,
- &next_fold_len);
- fold_len += next_fold_len;
- }
- else {
- if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
- foldbuf[fold_len] = 's';
+ /* Here, we have filled a /il node, and there are unfolded
+ * characters in it. If the runtime locale turns out to be
+ * UTF-8, there are possible multi-character folds, just
+ * like when not under /l. The node hence can't terminate
+ * in the middle of such a fold. To determine this, we
+ * have to create a folded copy of this node. That means
+ * reparsing the node, folding everything assuming a UTF-8
+ * locale. (If at runtime it isn't such a locale, the
+ * actions here wouldn't have been necessary, but we have
+ * to assume the worst case.) If we find we need to back
+ * off the folded string, we do so, and then map that
+ * position back to the original unfolded node, which then
+ * gets output, truncated at that spot */
+
+ char * redo_p = RExC_parse;
+ char * redo_e;
+ char * old_redo_e;
+
+ /* Allow enough space assuming a single byte input folds to
+ * a single byte output, plus assume that the two unparsed
+ * characters (that we may need) fold to the largest number
+ * of bytes possible, plus extra for one more worst case
+ * scenario. In the loop below, if we start eating into
+ * that final spare space, we enlarge this initial space */
+ Size_t size = max_string_len + (3 * UTF8_MAXBYTES_CASE) + 1;
+
+ Newxz(locfold_buf, size, char);
+ Newxz(loc_correspondence, size, Size_t);
+
+ /* Redo this node's parse, folding into 'locfold_buf' */
+ redo_p = RExC_parse;
+ redo_e = locfold_buf;
+ while (redo_p <= oldp) {
+
+ old_redo_e = redo_e;
+ loc_correspondence[redo_e - locfold_buf]
+ = redo_p - RExC_parse;
+
+ if (UTF) {
+ Size_t added_len;
+
+ (void) _to_utf8_fold_flags((U8 *) redo_p,
+ (U8 *) RExC_end,
+ (U8 *) redo_e,
+ &added_len,
+ FOLD_FLAGS_FULL);
+ redo_e += added_len;
+ redo_p += UTF8SKIP(redo_p);
}
else {
- foldbuf[fold_len] = toLOWER_L1(p[1]);
+
+ /* Note that if this code is run on some ancient
+ * Unicode versions, SHARP S doesn't fold to 'ss',
+ * but rather than clutter the code with #ifdef's,
+ * as is done above, we ignore that possibility.
+ * This is ok because this code doesn't affect what
+ * gets matched, but merely where the node gets
+ * split */
+ if (UCHARAT(redo_p) != LATIN_SMALL_LETTER_SHARP_S) {
+ *redo_e++ = toLOWER_L1(UCHARAT(redo_p));
+ }
+ else {
+ *redo_e++ = 's';
+ *redo_e++ = 's';
+ }
+ redo_p++;
+ }
+
+
+ /* If we're getting so close to the end that a
+ * worst-case fold in the next character would cause us
+ * to overflow, enlarge the buffers, assuming one output
+ * byte per remaining input byte, plus room for another
+ * worst-case fold */
+ if ( redo_p <= oldp
+ && redo_e > locfold_buf + size
+ - (UTF8_MAXBYTES_CASE + 1))
+ {
+ Size_t new_size = size
+ + (oldp - redo_p)
+ + UTF8_MAXBYTES_CASE + 1;
+ Ptrdiff_t e_offset = redo_e - locfold_buf;
+
+ Renew(locfold_buf, new_size, char);
+ Renew(loc_correspondence, new_size, Size_t);
+ size = new_size;
+
+ redo_e = locfold_buf + e_offset;
}
- fold_len++;
}
+
+ /* Set so that things are in terms of the folded, temporary
+ * string */
+ s = old_redo_e;
+ s_start = locfold_buf;
+ e = redo_e;
+
}
- /* Here foldbuf contains the the fold of p, and if appropriate
- * that of the character following p in the input. */
+ /* Here, we have 's', 's_start' and 'e' set up to point to the
+ * input that goes into the node, folded.
+ *
+ * If the final character of the node and the fold of ender
+ * form the first two characters of a three character fold, we
+ * need to peek ahead at the next (unparsed) character in the
+ * input to determine if the three actually do form such a
+ * fold. Just looking at that character is not generally
+ * sufficient, as it could be, for example, an escape sequence
+ * that evaluates to something else, and it needs to be folded.
+ *
+ * khw originally thought to just go through the parse loop one
+ * extra time, but that doesn't work easily as that iteration
+ * could cause things to think that the parse is over and to
+ * goto loopdone. The character could be a '$' for example, or
+ * the character beyond could be a quantifier, and other
+ * glitches as well.
+ *
+ * The solution used here for peeking ahead is to look at that
+ * next character. If it isn't ASCII punctuation, then it will
+ * be something that continues in an EXACTish node if there
+ * were space. We append the fold of it to s, having reserved
+ * enough room in s0 for the purpose. If we can't reasonably
+ * peek ahead, we instead assume the worst case: that it is
+ * something that would form the completion of a multi-char
+ * fold.
+ *
+ * If we can't split between s and ender, we work backwards
+ * character-by-character down to s0. At each current point
+ * see if we are at the beginning of a multi-char fold. If so,
+ * that means we would be splitting the fold across nodes, and
+ * so we back up one and try again.
+ *
+ * If we're not at the beginning, we still could be at the
+ * final two characters of a (rare) three character fold. We
+ * check if the sequence starting at the character before the
+ * current position (and including the current and next
+ * characters) is a three character fold. If not, the node can
+ * be split here. If it is, we have to backup two characters
+ * and try again.
+ *
+ * Otherwise, the node can be split at the current position.
+ *
+ * The same logic is used for UTF-8 patterns and not */
+ if (UTF) {
+ Size_t added_len;
+
+ /* Append the fold of ender */
+ (void) _to_uni_fold_flags(
+ ender,
+ (U8 *) e,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ e += added_len;
+
+ /* 's' and the character folded to by ender may be the
+ * first two of a three-character fold, in which case the
+ * node should not be split here. That may mean examining
+ * the so-far unparsed character starting at 'p'. But if
+ * ender folded to more than one character, we already have
+ * three characters to look at. Also, we first check if
+ * the sequence consisting of s and the next character form
+ * the first two of some three character fold. If not,
+ * there's no need to peek ahead. */
+ if ( added_len <= UTF8SKIP(e - added_len)
+ && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_utf8_safe(s, e)))
+ {
+ /* Here, the two do form the beginning of a potential
+ * three character fold. The unexamined character may
+ * or may not complete it. Peek at it. It might be
+ * something that ends the node or an escape sequence,
+ * in which case we don't know without a lot of work
+ * what it evaluates to, so we have to assume the worst
+ * case: that it does complete the fold, and so we
+ * can't split here. All such instances will have
+ * that character be an ASCII punctuation character,
+ * like a backslash. So, for that case, backup one and
+ * drop down to try at that position */
+ if (isPUNCT(*p)) {
+ s = (char *) utf8_hop_back((U8 *) s, -1,
+ (U8 *) s_start);
+ backed_up = TRUE;
+ }
+ else {
+ /* Here, since it's not punctuation, it must be a
+ * real character, and we can append its fold to
+ * 'e' (having deliberately reserved enough space
+ * for this eventuality) and drop down to check if
+ * the three actually do form a folded sequence */
+ (void) _to_utf8_fold_flags(
+ (U8 *) p, (U8 *) RExC_end,
+ (U8 *) e,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ e += added_len;
+ }
+ }
- /* Search backwards until find a place that doesn't split a
- * multi-char fold */
- while (1) {
- STRLEN s_len;
- char s_fold_buf[UTF8_MAXBYTES_CASE];
- char * s_fold = s_fold_buf;
+ /* Here, we either have three characters available in
+ * sequence starting at 's', or we have two characters and
+ * know that the following one can't possibly be part of a
+ * three character fold. We go through the node backwards
+ * until we find a place where we can split it without
+ * breaking apart a multi-character fold. At any given
+ * point we have to worry about if such a fold begins at
+ * the current 's', and also if a three-character fold
+ * begins at s-1, (containing s and s+1). Splitting in
+ * either case would break apart a fold */
+ do {
+ char *prev_s = (char *) utf8_hop_back((U8 *) s, -1,
+ (U8 *) s_start);
+
+ /* If is a multi-char fold, can't split here. Backup
+ * one char and try again */
+ if (UNLIKELY(is_MULTI_CHAR_FOLD_utf8_safe(s, e))) {
+ s = prev_s;
+ backed_up = TRUE;
+ continue;
+ }
- if (s <= s0) {
+ /* If the two characters beginning at 's' are part of a
+ * three character fold starting at the character
+ * before s, we can't split either before or after s.
+ * Backup two chars and try again */
+ if ( LIKELY(s > s_start)
+ && UNLIKELY(is_THREE_CHAR_FOLD_utf8_safe(prev_s, e)))
+ {
+ s = prev_s;
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s_start);
+ backed_up = TRUE;
+ continue;
+ }
- /* There's no safe place in the node to split. Quit so
- * will take the whole node */
- oldp = sav_oldp;
+ /* Here there's no multi-char fold between s and the
+ * next character following it. We can split */
+ splittable = TRUE;
break;
- }
- /* Backup 1 character. The first time through this moves s
- * to point to the final character in the node */
- if (UTF) {
- s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+ } while (s > s_start); /* End of loops backing up through the node */
+
+ /* Here we either couldn't find a place to split the node,
+ * or else we broke out of the loop setting 'splittable' to
+ * true. In the latter case, the place to split is between
+ * the first and second characters in the sequence starting
+ * at 's' */
+ if (splittable) {
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Pattern not UTF-8 */
+ if ( ender != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED)
+ {
+ *e++ = toLOWER_L1(ender);
}
else {
- s--;
+ *e++ = 's';
+ *e++ = 's';
}
- /* 's' may or may not be folded; so make sure it is, and
- * use just the final character in its fold (should there
- * be more than one */
- if (UTF) {
- toFOLD_utf8_safe((U8*) s,
- (U8*) s + UTF8SKIP(s),
- (U8 *) s_fold_buf, &s_len);
- while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
- {
- s_fold += UTF8SKIP(s_fold);
+ if ( e - s <= 1
+ && UNLIKELY(is_THREE_CHAR_FOLD_HEAD_latin1_safe(s, e)))
+ {
+ if (isPUNCT(*p)) {
+ s--;
+ backed_up = TRUE;
+ }
+ else {
+ if ( UCHARAT(p) != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED)
+ {
+ *e++ = toLOWER_L1(ender);
+ }
+ else {
+ *e++ = 's';
+ *e++ = 's';
+ }
}
- s_len = UTF8SKIP(s_fold);
}
- else {
- if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
- {
- s_fold_buf[0] = 's';
+
+ do {
+ if (UNLIKELY(is_MULTI_CHAR_FOLD_latin1_safe(s, e))) {
+ s--;
+ backed_up = TRUE;
+ continue;
}
- else { /* This works for all other non-UTF-8 folds
- */
- s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+
+ if ( LIKELY(s > s_start)
+ && UNLIKELY(is_THREE_CHAR_FOLD_latin1_safe(s - 1, e)))
+ {
+ s -= 2;
+ backed_up = TRUE;
+ continue;
}
- s_len = 1;
+
+ splittable = TRUE;
+ break;
+
+ } while (s > s_start);
+
+ if (splittable) {
+ s++;
}
+ }
- /* Unshift this character to the beginning of the buffer,
- * No longer needed trailing characters are overwritten.
- * */
- Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
- Copy(s_fold, foldbuf, s_len, U8);
-
- /* If this isn't a multi-character fold, we have found a
- * splittable place. If this is the final character in the
- * node, that means the node is valid as-is, and can quit.
- * Otherwise, we note how much we can fill the node before
- * coming to a non-splittable position, and go parse it
- * again, stopping there. This is done because we know
- * where in the output to stop, but we don't have a map to
- * where that is in the input. One could be created, but
- * it seems like overkill for such a rare event as we are
- * dealing with here */
- if (UTF) {
- if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
- foldbuf + UTF8_MAXBYTES_CASE))
- {
- upper_fill = s + UTF8SKIP(s) - s0;
- if (LIKELY(oldp)) {
- break;
+ /* Here, we are done backing up. If we didn't backup at all
+ * (the likely case), just proceed */
+ if (backed_up) {
+
+ /* If we did find a place to split, reparse the entire node
+ * stopping where we have calculated. */
+ if (splittable) {
+
+ /* If we created a temporary folded string under /l, we
+ * have to map that back to the original */
+ if (need_to_fold_loc) {
+ upper_fill = loc_correspondence[s - s_start];
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+
+ if (upper_fill == 0) {
+ FAIL2("panic: loc_correspondence[%d] is 0",
+ (int) (s - s_start));
}
- goto reparse;
}
- }
- else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
- foldbuf + UTF8_MAXBYTES_CASE))
- {
- upper_fill = s + 1 - s0;
- if (LIKELY(oldp)) {
- break;
+ else {
+ upper_fill = s - s0;
}
goto reparse;
}
+ else if (need_to_fold_loc) {
+ Safefree(locfold_buf);
+ Safefree(loc_correspondence);
+ }
- oldp = old_oldp;
- old_oldp = NULL;
-
- } /* End of loop backing up through the node */
/* Here the node consists entirely of non-final multi-char
* folds. (Likely it is all 'f's or all 's's.) There's no
* decent place to split it, so give up and just take the
* whole thing */
-
+ len = old_s - s0;
+ }
} /* End of verifying node ends with an appropriate char */
- p = oldp;
+ /* We need to start the next node at the character that didn't fit
+ * in this one */
+ p = oldp;
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
assert(PL_regkind[OP(node)] == ANYOF);
/* There is no bitmap for this node type */
- if (inRANGE(OP(node), ANYOFH, ANYOFHr)) {
+ if (inRANGE(OP(node), ANYOFH, ANYOFRb)) {
return;
}
well have generated non-portable code points, but
they're valid on this machine */
FALSE, /* similarly, no need for strict */
- FALSE, /* Require return to be an ANYOF */
+
+ /* We can optimize into something besides an ANYOF, except
+ * under /l, which needs to be ANYOF because of runtime
+ * checks for locale sanity, etc */
+ ! in_locale,
NULL
);
UPDATE_WARNINGS_LOC(RExC_parse);
}
+PERL_STATIC_INLINE Size_t
+S_find_first_differing_byte_pos(const U8 * s1, const U8 * s2, const Size_t max)
+{
+    /* Compares 's1' and 's2' byte by byte, looking at no more than 'max'
+     * bytes, and returns the offset of the first position at which they
+     * differ.  Returns 'max' if the first 'max' bytes are identical (which
+     * includes the case where 'max' is 0).  The caller must ensure that both
+     * buffers are readable up to the first difference or 'max' bytes,
+     * whichever comes first. */
+
+    const U8 * const start = s1;
+    const U8 * const send = start + max;
+
+    PERL_ARGS_ASSERT_FIND_FIRST_DIFFERING_BYTE_POS;
+
+    /* Advance both pointers in lock step while the bytes agree */
+    while (s1 < send && *s1 == *s2) {
+        s1++; s2++;
+    }
+
+    return s1 - start;
+}
+
+
STATIC AV *
S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
{
|= ANYOFL_FOLD
| ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
- else if (cp_list) { /* Look to see if a 0-255 code point is in list */
- UV start, end;
- invlist_iterinit(cp_list);
- if (invlist_iternext(cp_list, &start, &end) && start < 256) {
- anyof_flags |= ANYOFL_FOLD;
- has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
- }
- invlist_iterfinish(cp_list);
+ else if (cp_list && invlist_lowest(cp_list) < 256) {
+ /* If nothing is below 256, has no locale dependency; otherwise it
+ * does */
+ anyof_flags |= ANYOFL_FOLD;
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
}
}
else if ( DEPENDS_SEMANTICS
if (optimizable) {
PERL_UINT_FAST8_T i;
- Size_t partial_cp_count = 0;
+ UV partial_cp_count = 0;
UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
UV end[MAX_FOLD_FROMS+1] = { 0 };
+ bool single_range = FALSE;
if (cp_list) { /* Count the code points in enough ranges that we would
see all the ones possible in any fold in this version
partial_cp_count += end[i] - start[i] + 1;
}
+ if (i == 1) {
+ single_range = TRUE;
+ }
invlist_iterfinish(cp_list);
}
* participates in no fold whatsoever, and having it EXACT tells the
* optimizer the target string cannot match unless it has a colon in
* it.
- *
- * We don't typically generate an EXACTish node if doing so would
- * require changing the pattern to UTF-8, as that affects /d and
- * otherwise is slower. However, under /i, not changing to UTF-8 can
- * miss some potential multi-character folds. We calculate the
- * EXACTish node, and then decide if something would be missed if we
- * don't upgrade */
+ */
if ( ! posixl
&& ! invert
/* Only try if there are no more code points in the class than
* in the max possible fold */
- && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
-
- && (start[0] < 256 || UTF || FOLD))
+ && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1)
{
if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
{
if (LOC) {
- /* Here is /l: Use EXACTL, except /li indicates EXACTFL,
- * as that means there is a fold not known until runtime so
- * shows as only a single code point here. */
- op = (FOLD) ? EXACTFL : EXACTL;
+ /* Here is /l: Use EXACTL, except if there is a fold not
+ * known until runtime so shows as only a single code point
+ * here. For code points above 255, we know which can
+ * cause problems by having a potential fold to the Latin1
+ * range. */
+ if ( ! FOLD
+ || ( start[0] > 255
+ && ! is_PROBLEMATIC_LOCALE_FOLD_cp(start[0])))
+ {
+ op = EXACTL;
+ }
+ else {
+ op = EXACTFL;
+ }
}
else if (! FOLD) { /* Not /l and not /i */
op = (start[0] < 256) ? EXACT : EXACT_REQ8;
}
if (op != END) {
+ U8 len;
- /* Here, we have calculated what EXACTish node we would use.
- * But we don't use it if it would require converting the
- * pattern to UTF-8, unless not using it could cause us to miss
- * some folds (hence be buggy) */
-
- if (! UTF && value > 255) {
- SV * in_multis = NULL;
-
- assert(FOLD);
-
- /* If there is no code point that is part of a multi-char
- * fold, then there aren't any matches, so we don't do this
- * optimization. Otherwise, it could match depending on
- * the context around us, so we do upgrade */
- _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
- if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+ /* Here, we have calculated what EXACTish node to use. Have to
+ * convert to UTF-8 if not already there */
+ if (value > 255) {
+ if (! UTF) {
+ SvREFCNT_dec(cp_list);;
REQUIRE_UTF8(flagp);
}
- else {
- op = END;
+
+ /* This is a kludge to work around the special casing issues with
+ * this ligature under /aa. FB05 should fold to FB06, but the
+ * call above to _to_uni_fold_flags() didn't find this, as
+ * it didn't use the /aa restriction in order to not miss
+ * other folds that would be affected. This is the only
+ * instance likely to ever be a problem in all of Unicode.
+ * So special case it. */
+ if ( value == LATIN_SMALL_LIGATURE_LONG_S_T
+ && ASCII_FOLD_RESTRICTED)
+ {
+ value = LATIN_SMALL_LIGATURE_ST;
}
}
- if (op != END) {
- U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
+ len = (UTF) ? UVCHR_SKIP(value) : 1;
- ret = regnode_guts(pRExC_state, op, len, "exact");
- FILL_NODE(ret, op);
- RExC_emit += 1 + STR_SZ(len);
- setSTR_LEN(REGNODE_p(ret), len);
- if (len == 1) {
- *STRING(REGNODE_p(ret)) = (U8) value;
- }
- else {
- uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
- }
- goto not_anyof;
+ ret = regnode_guts(pRExC_state, op, len, "exact");
+ FILL_NODE(ret, op);
+ RExC_emit += 1 + STR_SZ(len);
+ setSTR_LEN(REGNODE_p(ret), len);
+ if (len == 1) {
+ *STRINGs(REGNODE_p(ret)) = (U8) value;
+ }
+ else {
+ uvchr_to_utf8((U8 *) STRINGs(REGNODE_p(ret)), value);
}
+ goto not_anyof;
}
}
SvREFCNT_dec(intersection);
}
+ /* If it is a single contiguous range, ANYOFR is an efficient regnode,
+ * both in size and speed. Currently, a 20 bit range base (smallest
+ * code point in the range), and a 12 bit maximum delta are packed into
+ * a 32 bit word. This allows for using it on all of the Unicode code
+ * points except for the highest plane, which is only for private use
+ * code points. khw doubts that a bigger delta is likely in real world
+ * applications */
+ if ( single_range
+ && ! has_runtime_dependency
+ && anyof_flags == 0
+ && start[0] < (1 << ANYOFR_BASE_BITS)
+ && end[0] - start[0]
+ < ((1U << (sizeof(((struct regnode_1 *)NULL)->arg1)
+ * CHARBITS - ANYOFR_BASE_BITS))))
+
+ {
+ U8 low_utf8[UTF8_MAXBYTES+1];
+ U8 high_utf8[UTF8_MAXBYTES+1];
+
+ ret = reganode(pRExC_state, ANYOFR,
+ (start[0] | (end[0] - start[0]) << ANYOFR_BASE_BITS));
+
+ /* Convert both ends of the range to UTF-8. A start byte derived from
+ * these is stored in the node's flags field, so that many possible
+ * inputs can be ruled out efficiently at run time. */
+ (void) uvchr_to_utf8(low_utf8, start[0]);
+ (void) uvchr_to_utf8(high_utf8, end[0]);
+
+ /* If all code points share the same first byte, this can be an
+ * ANYOFRb. Otherwise store the lowest UTF-8 start byte which can
+ * quickly rule out many inputs at run-time without having to
+ * compute the code point from UTF-8. For EBCDIC, we use I8, as
+ * not doing that transformation would not rule out nearly so many
+ * things */
+ if (low_utf8[0] == high_utf8[0]) {
+ OP(REGNODE_p(ret)) = ANYOFRb;
+ ANYOF_FLAGS(REGNODE_p(ret)) = low_utf8[0];
+ }
+ else {
+ ANYOF_FLAGS(REGNODE_p(ret))
+ = NATIVE_UTF8_TO_I8(low_utf8[0]);
+ }
+
+ goto not_anyof;
+ }
+
/* If didn't find an optimization and there is no need for a bitmap,
* optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
U8 low_utf8[UTF8_MAXBYTES+1];
UV highest_cp = invlist_highest(cp_list);
- op = ANYOFH;
-
/* Currently the maximum allowed code point by the system is
* IV_MAX. Higher ones are reserved for future internal use. This
* particular regnode can be used for higher ones, but we can't
* calculate the code point of those. IV_MAX suffices though, as
* it will be a large first byte */
- (void) uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX));
+ Size_t low_len = uvchr_to_utf8(low_utf8, MIN(start[0], IV_MAX))
+ - low_utf8;
/* We store the lowest possible first byte of the UTF-8
* representation, using the flags field. This allows for quick
* transformation would not rule out nearly so many things */
anyof_flags = NATIVE_UTF8_TO_I8(low_utf8[0]);
+ op = ANYOFH;
+
/* If the first UTF-8 start byte for the highest code point in the
* range is suitably small, we may be able to get an upper bound as
* well */
if (highest_cp <= IV_MAX) {
U8 high_utf8[UTF8_MAXBYTES+1];
-
- (void) uvchr_to_utf8(high_utf8, highest_cp);
+ Size_t high_len = uvchr_to_utf8(high_utf8, highest_cp)
+ - high_utf8;
/* If the lowest and highest are the same, we can get an exact
- * first byte instead of a just minimum. We signal this with a
- * different regnode */
+ * first byte instead of just a minimum, or even a sequence of
+ * exact leading bytes. We signal these with different
+ * regnodes */
if (low_utf8[0] == high_utf8[0]) {
+ Size_t len = find_first_differing_byte_pos(low_utf8,
+ high_utf8,
+ MIN(low_len, high_len));
- /* No need to convert to I8 for EBCDIC as this is an exact
- * match */
- anyof_flags = low_utf8[0];
- op = ANYOFHb;
+ if (len == 1) {
+
+ /* No need to convert to I8 for EBCDIC as this is an
+ * exact match */
+ anyof_flags = low_utf8[0];
+ op = ANYOFHb;
+ }
+ else {
+ op = ANYOFHs;
+ ret = regnode_guts(pRExC_state, op,
+ regarglen[op] + STR_SZ(len),
+ "anyofhs");
+ FILL_NODE(ret, op);
+ RExC_emit += 1 + regarglen[op]
+ - 1 + STR_SZ(len); /* Replace the [1]
+ element of the struct
+ by the real value */
+ REGNODE_p(ret)->flags = len;
+ Copy(low_utf8, /* Add the common bytes */
+ ((struct regnode_anyofhs *) REGNODE_p(ret))->string,
+ len, U8);
+ NEXT_OFF(REGNODE_p(ret)) = regarglen[op] + STR_SZ(len);
+ set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
+ NULL, only_utf8_locale_list);
+ goto not_anyof;
+ }
}
else if (NATIVE_UTF8_TO_I8(high_utf8[0]) <= MAX_ANYOF_HRx_BYTE)
{
set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
- ? listsv : NULL,
+ ? listsv
+ : NULL,
only_utf8_locale_list);
+ SvREFCNT_dec(cp_list);;
+ SvREFCNT_dec(only_utf8_locale_list);
return ret;
not_anyof:
Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
RExC_parse - orig_parse);;
SvREFCNT_dec(cp_list);;
+ SvREFCNT_dec(only_utf8_locale_list);
return ret;
}
SV *rv;
if (cp_list) {
- av_store(av, INVLIST_INDEX, cp_list);
+ av_store(av, INVLIST_INDEX, SvREFCNT_inc(cp_list));
}
if (only_utf8_locale_list) {
- av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list);
+ av_store(av, ONLY_LOCALE_MATCHES_INDEX,
+ SvREFCNT_inc(only_utf8_locale_list));
}
if (runtime_defns) {
S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
{
/* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode
- * equivalents space. It aligns and increments RExC_size and RExC_emit
+ * equivalents space. It aligns and increments RExC_size.
*
* It returns the regnode's offset into the regex engine program */
NULL,
NULL,
NULL,
+ 0,
FALSE
);
sv_catpvs(sv, "]");
else if (k == LOGICAL)
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
- else if (k == ANYOF) {
- const U8 flags = inRANGE(OP(o), ANYOFH, ANYOFHr)
- ? 0
- : ANYOF_FLAGS(o);
+ else if (k == ANYOF || k == ANYOFR) {
+ U8 flags;
+ char * bitmap;
+ U32 arg;
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* And things that aren't in the bitmap, but are small enough to be */
SV* bitmap_range_not_in_bitmap = NULL;
- const bool inverted = flags & ANYOF_INVERT;
+ bool inverted;
+
+ if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
+ flags = 0;
+ bitmap = NULL;
+ arg = 0;
+ }
+ else {
+ flags = ANYOF_FLAGS(o);
+ bitmap = ANYOF_BITMAP(o);
+ arg = ARG(o);
+ }
if (OP(o) == ANYOFL || OP(o) == ANYOFPOSIXL) {
if (ANYOFL_UTF8_LOCALE_REQD(flags)) {
}
}
+ inverted = flags & ANYOF_INVERT;
+
/* If there is stuff outside the bitmap, get it */
- if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
- (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ if (arg != ANYOF_ONLY_HAS_BITMAP) {
+ if (inRANGE(OP(o), ANYOFR, ANYOFRb)) {
+ nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist,
+ ANYOFRbase(o),
+ ANYOFRbase(o) + ANYOFRdelta(o));
+ }
+ else {
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
&unresolved,
&only_utf8_locale_invlist,
&nonbitmap_invlist);
+ }
+
/* The non-bitmap data may contain stuff that could fit in the
* bitmap. This could come from a user-defined property being
* finally resolved when this call was done; or much more likely
* because there are matches that require UTF-8 to be valid, and so
- * aren't in the bitmap. This is teased apart later */
+ * aren't in the bitmap (or ANYOFR). This is teased apart later */
_invlist_intersection(nonbitmap_invlist,
PL_InBitmap,
&bitmap_range_not_in_bitmap);
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (! inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ /* ANYOFH by definition doesn't have anything that will fit inside the
+ * bitmap; ANYOFR may or may not. */
+ if ( ! inRANGE(OP(o), ANYOFH, ANYOFHr)
+ && ( ! inRANGE(OP(o), ANYOFR, ANYOFRb)
+ || ANYOFRbase(o) < NUM_ANYOF_CODE_POINTS))
+ {
/* Then all the things that could fit in the bitmap */
do_sep = put_charclass_bitmap_innards(sv,
- ANYOF_BITMAP(o),
+ bitmap,
bitmap_range_not_in_bitmap,
only_utf8_locale_invlist,
o,
+ flags,
/* Can't try inverting for a
* better display if there
* are things that haven't
* been resolved */
- unresolved != NULL);
+ unresolved != NULL
+ || inRANGE(OP(o), ANYOFR, ANYOFRb));
SvREFCNT_dec(bitmap_range_not_in_bitmap);
/* If there are user-defined properties which haven't been defined
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
- if (inRANGE(OP(o), ANYOFH, ANYOFHr)) {
+ if (OP(o) == ANYOFHs) {
+ Perl_sv_catpvf(aTHX_ sv, " (Leading UTF-8 bytes=%s", _byte_dump_string((U8 *) ((struct regnode_anyofhs *) o)->string, FLAGS(o), 1));
+ }
+ else if (inRANGE(OP(o), ANYOFH, ANYOFRb)) {
U8 lowest = (OP(o) != ANYOFHr)
? FLAGS(o)
: LOWEST_ANYOF_HRx_BYTE(FLAGS(o));
- U8 highest = (OP(o) == ANYOFHb)
- ? lowest
- : OP(o) == ANYOFH
+ U8 highest = (OP(o) == ANYOFHr)
+ ? HIGHEST_ANYOF_HRx_BYTE(FLAGS(o))
+ : (OP(o) == ANYOFH || OP(o) == ANYOFR)
? 0xFF
- : HIGHEST_ANYOF_HRx_BYTE(FLAGS(o));
+ : lowest;
Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=%02X", lowest);
if (lowest != highest) {
Perl_sv_catpvf(aTHX_ sv, "-%02X", highest);
_invlist_invert(cp_list);
}
- put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+ put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, 0, TRUE);
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
SvREFCNT_dec(cp_list);
SV *nonbitmap_invlist,
SV *only_utf8_locale_invlist,
const regnode * const node,
+ const U8 flags,
const bool force_as_is_display)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* 'node' is the regex pattern ANYOF node. It is needed only when the
* above two parameters are not null, and is passed so that this
* routine can tease apart the various reasons for them.
+ * 'flags' is the flags field of 'node'
* 'force_as_is_display' is TRUE if this routine should definitely NOT try
* to invert things to see if that leads to a cleaner display. If
* FALSE, this routine is free to use its judgment about doing this.
literally */
SV* inverted_display; /* The output string when we invert the inputs */
- U8 flags = (node) ? ANYOF_FLAGS(node) : 0;
-
bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted
to match? */
/* We are biased in favor of displaying things without them being inverted,
UNI__PERL_FOLDS_TO_MULTI_CHAR]);
PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
- PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
- UNI__PERL_NON_FINAL_FOLDS]);
-
PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);
PL_utf8_totitle = _new_invlist_C_array(Titlecase_Mapping_invlist);