#endif
#include "dquote_static.c"
+#ifndef PERL_IN_XSUB_RE
+# include "charclass_invlists.h"
+#endif
#ifdef op
#undef op
|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
else
data->flags &= ~SF_FIX_BEFORE_EOL;
- data->minlen_fixed=minlenp;
+ data->minlen_fixed=minlenp;
data->lookbehind_fixed=0;
}
else { /* *data->longest == data->longest_float */
#endif
switch (flags) {
+ case EXACT: break;
case EXACTFA:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
+ default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
}
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
"%*sCompiling trie using list compiler\n",
(int)depth * 2 + 2, ""));
-
+
trie->states = (reg_trie_state *)
PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
sizeof(reg_trie_state) );
}});
+/* The below joins as many adjacent EXACTish nodes as possible into a single
+ * one, and looks for problematic sequences of characters whose folds vs.
+ * non-folds have sufficiently different lengths, that the optimizer would be
+ * fooled into rejecting legitimate matches of them, and the trie construction
+ * code can't cope with them. The joining is only done if:
+ * 1) there is room in the current conglomerated node to entirely contain the
+ * next one.
+ * 2) they are the exact same node type
+ *
+ * The adjacent nodes actually may be separated by NOTHING kind nodes, and
+ * these get optimized out
+ *
+ * If there are problematic code sequences, *min_subtract is set to the delta
+ * that the minimum size of the node can be less than its actual size. And,
+ * the node type of the result is changed to reflect that it contains these
+ * sequences.
+ *
+ * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
+ * and contains LATIN SMALL LETTER SHARP S
+ *
+ * This is as good a place as any to discuss the design of handling these
+ * problematic sequences. It's been wrong in Perl for a very long time. There
+ * are three code points in Unicode whose folded lengths differ so much from
+ * the un-folded lengths that it causes problems for the optimizer and trie
+ * construction. Why only these are problematic, and not others where lengths
+ * also differ is something I (khw) do not understand. New versions of Unicode
+ * might add more such code points. Hopefully the logic in fold_grind.t that
+ * figures out what to test (in part by verifying that each size-combination
+ * gets tested) will catch any that do come along, so they can be added to the
+ * special handling below. The chances of new ones are actually rather small,
+ * as most, if not all, of the world's scripts that have casefolding have
+ * already been encoded by Unicode. Also, a number of Unicode's decisions were
+ * made to allow compatibility with pre-existing standards, and almost all of
+ * those have already been dealt with. These would otherwise be the most
+ * likely candidates for generating further tricky sequences. In other words,
+ * Unicode by itself is unlikely to add new ones unless it is for compatibility
+ * with pre-existing standards, and there aren't many of those left.
+ *
+ * The previous designs for dealing with these involved assigning a special
+ * node for them. This approach doesn't work, as evidenced by this example:
+ * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
+ * Both these fold to "sss", but if the pattern is parsed to create a node of
+ * that would match just the \xDF, it won't be able to handle the case where a
+ * successful match would have to cross the node's boundary. The new approach
+ * that hopefully generally solves the problem generates an EXACTFU_SS node
+ * that is "sss".
+ *
+ * There are a number of components to the approach (a lot of work for just
+ * three code points!):
+ * 1) This routine examines each EXACTFish node that could contain the
+ * problematic sequences. It returns in *min_subtract how much to
+ * subtract from the the actual length of the string to get a real minimum
+ * for one that could match it. This number is usually 0 except for the
+ * problematic sequences. This delta is used by the caller to adjust the
+ * min length of the match, and the delta between min and max, so that the
+ * optimizer doesn't reject these possibilities based on size constraints.
+ * 2) These sequences are not currently correctly handled by the trie code
+ * either, so it changes the joined node type to ops that are not handled
+ * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ * 3) This is sufficient for the two Greek sequences (described below), but
+ * the one involving the Sharp s (\xDF) needs more. The node type
+ * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
+ * sequence in it. For non-UTF-8 patterns and strings, this is the only
+ * case where there is a possible fold length change. That means that a
+ * regular EXACTFU node without UTF-8 involvement doesn't have to concern
+ * itself with length changes, and so can be processed faster. regexec.c
+ * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
+ * is pre-folded by regcomp.c. This saves effort in regex matching.
+ * However, probably mostly for historical reasons, the pre-folding isn't
+ * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
+ * nodes, as what they fold to isn't known until runtime.) The fold
+ * possibilities for the non-UTF8 patterns are quite simple, except for
+ * the sharp s. All the ones that don't involve a UTF-8 target string
+ * are members of a fold-pair, and arrays are set up for all of them
+ * that quickly find the other member of the pair. It might actually
+ * be faster to pre-fold these, but it isn't currently done, except for
+ * the sharp s. Code elsewhere in this file makes sure that it gets
+ * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
+ * issues described in the next item.
+ * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
+ * 'ss' or not is not knowable at compile time. It will match iff the
+ * target string is in UTF-8, unlike the EXACTFU nodes, where it always
+ * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
+ * it can't be folded to "ss" at compile time, unlike EXACTFU does as
+ * described in item 3). An assumption that the optimizer part of
+ * regexec.c (probably unwittingly) makes is that a character in the
+ * pattern corresponds to at most a single character in the target string.
+ * (And I do mean character, and not byte here, unlike other parts of the
+ * documentation that have never been updated to account for multibyte
+ * Unicode.) This assumption is wrong only in this case, as all other
+ * cases are either 1-1 folds when no UTF-8 is involved; or is true by
+ * virtue of having this file pre-fold UTF-8 patterns. I'm
+ * reluctant to try to change this assumption, so instead the code punts.
+ * This routine examines EXACTF nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node is an EXACTF node that
+ * contains a sharp s. When it is true, the caller sets a flag that later
+ * causes the optimizer in this file to not set values for the floating
+ * and fixed string lengths, and thus avoids the optimizer code in
+ * regexec.c that makes the invalid assumption. Thus, there is no
+ * optimization based on string lengths for EXACTF nodes that contain the
+ * sharp s. This only happens for /id rules (which means the pattern
+ * isn't in UTF-8).
+ */
-
-
-#define JOIN_EXACT(scan,min,flags) \
+#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
+ join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
U32 stringok = 1;
PERL_UNUSED_ARG(val);
#endif
DEBUG_PEEP("join",scan,depth);
-
- /* Skip NOTHING, merge EXACT*. */
- while (n &&
- ( PL_regkind[OP(n)] == NOTHING ||
- (stringok && (OP(n) == OP(scan))))
+
+ /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
+ * EXACT ones that are mergeable to the current one. */
+ while (n
+ && (PL_regkind[OP(n)] == NOTHING
+ || (stringok && OP(n) == OP(scan)))
&& NEXT_OFF(n)
- && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
+ && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
+ {
if (OP(n) == TAIL || n > next)
stringok = 0;
else if (stringok) {
const unsigned int oldl = STR_LEN(scan);
regnode * const nnext = regnext(n);
+
+ if (oldl + STR_LEN(n) > U8_MAX)
+ break;
DEBUG_PEEP("merg",n,depth);
-
merged++;
- if (oldl + STR_LEN(n) > U8_MAX)
- break;
+
NEXT_OFF(scan) += NEXT_OFF(n);
STR_LEN(scan) += STR_LEN(n);
next = n + NODE_SZ_STR(n);
}
#endif
}
-#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
-#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
-#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
-#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
- if (UTF
- && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA)
- && ( STR_LEN(scan) >= 6 ) )
- {
- /*
- Two problematic code points in Unicode casefolding of EXACT nodes:
-
- U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
- U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
-
- which casefold to
-
- Unicode UTF-8
-
- U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
- U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
-
- This means that in case-insensitive matching (or "loose matching",
- as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
- length of the above casefolded versions) can match a target string
- of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
- This would rather mess up the minimum length computation.
-
- What we'll do is to look for the tail four bytes, and then peek
- at the preceding two bytes to see whether we need to decrease
- the minimum length by four (six minus two).
-
- Thanks to the design of UTF-8, there cannot be false matches:
- A sequence of valid UTF-8 bytes cannot be a subsequence of
- another valid sequence of UTF-8 bytes.
-
- */
- char * const s0 = STRING(scan), *s, *t;
- char * const s1 = s0 + STR_LEN(scan) - 1;
- char * const s2 = s1 - 4;
+ *min_subtract = 0;
+ *has_exactf_sharp_s = FALSE;
+
+ /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
+ * can now analyze for sequences of problematic code points. (Prior to
+ * this final joining, sequences could have been split over boundaries, and
+ * hence missed). The sequences only happen in folding, hence for any
+ * non-EXACT EXACTish node */
+ if (OP(scan) != EXACT) {
+ U8 *s;
+ U8 * s0 = (U8*) STRING(scan);
+ U8 * const s_end = s0 + STR_LEN(scan);
+
+ /* The below is perhaps overboard, but this allows us to save a test
+ * each time through the loop at the expense of a mask. This is
+ * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
+ * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
+ * This uses an exclusive 'or' to find that bit and then inverts it to
+ * form a mask, with just a single 0, in the bit position where 'S' and
+ * 's' differ. */
+ const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
+ const U8 s_masked = 's' & S_or_s_mask;
+
+ /* One pass is made over the node's string looking for all the
+ * possibilities. to avoid some tests in the loop, there are two main
+ * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
+ * non-UTF-8 */
+ if (UTF) {
+
+ /* There are two problematic Greek code points in Unicode
+ * casefolding
+ *
+ * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+ *
+ * which casefold to
+ *
+ * Unicode UTF-8
+ *
+ * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+ *
+ * This means that in case-insensitive matching (or "loose
+ * matching", as Unicode calls it), an EXACTF of length six (the
+ * UTF-8 encoded byte length of the above casefolded versions) can
+ * match a target string of length two (the byte length of UTF-8
+ * encoded U+0390 or U+03B0). This would rather mess up the
+ * minimum length computation. (there are other code points that
+ * also fold to these two sequences, but the delta is smaller)
+ *
+ * If these sequences are found, the minimum length is decreased by
+ * four (six minus two).
+ *
+ * Similarly, 'ss' may match the single char and byte LATIN SMALL
+ * LETTER SHARP S. We decrease the min length by 1 for each
+ * occurrence of 'ss' found */
+
#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
- const char t0[] = "\xaf\x49\xaf\x42";
-#else
- const char t0[] = "\xcc\x88\xcc\x81";
-#endif
- const char * const t1 = t0 + 3;
-
- for (s = s0 + 2;
- s < s2 && (t = ninstr(s, s1, t0, t1));
- s = t + 4) {
-#ifdef EBCDIC
- if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
- ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
+# define U390_first_byte 0xb4
+ const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
+# define U3B0_first_byte 0xb5
+ const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
#else
- if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
- ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+# define U390_first_byte 0xce
+ const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
+# define U3B0_first_byte 0xcf
+ const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
#endif
- *min -= 4;
- }
+ const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
+ yields a net of 0 */
+ /* Examine the string for one of the problematic sequences */
+ for (s = s0;
+ s < s_end - 1; /* Can stop 1 before the end, as minimum length
+ * sequence we are looking for is 2 */
+ s += UTF8SKIP(s))
+ {
+
+ /* Look for the first byte in each problematic sequence */
+ switch (*s) {
+ /* We don't have to worry about other things that fold to
+ * 's' (such as the long s, U+017F), as all above-latin1
+ * code points have been pre-folded */
+ case 's':
+ case 'S':
+
+ /* Current character is an 's' or 'S'. If next one is
+ * as well, we have the dreaded sequence */
+ if (((*(s+1) & S_or_s_mask) == s_masked)
+ /* These two node types don't have special handling
+ * for 'ss' */
+ && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ {
+ *min_subtract += 1;
+ OP(scan) = EXACTFU_SS;
+ s++; /* No need to look at this character again */
+ }
+ break;
+
+ case U390_first_byte:
+ if (s_end - s >= len
+
+ /* The 1's are because are skipping comparing the
+ * first byte */
+ && memEQ(s + 1, U390_tail, len - 1))
+ {
+ goto greek_sequence;
+ }
+ break;
+
+ case U3B0_first_byte:
+ if (! (s_end - s >= len
+ && memEQ(s + 1, U3B0_tail, len - 1)))
+ {
+ break;
+ }
+ greek_sequence:
+ *min_subtract += 4;
+
+ /* This can't currently be handled by trie's, so change
+ * the node type to indicate this. If EXACTFA and
+ * EXACTFL were ever to be handled by trie's, this
+ * would have to be changed. If this node has already
+ * been changed to EXACTFU_SS in this loop, leave it as
+ * is. (I (khw) think it doesn't matter in regexec.c
+ * for UTF patterns, but no need to change it */
+ if (OP(scan) == EXACTFU) {
+ OP(scan) = EXACTFU_NO_TRIE;
+ }
+ s += 6; /* We already know what this sequence is. Skip
+ the rest of it */
+ break;
+ }
+ }
+ }
+ else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+
+ /* Here, the pattern is not UTF-8. We need to look only for the
+ * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
+ * in the final position. Otherwise we can stop looking 1 byte
+ * earlier because have to find both the first and second 's' */
+ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
+
+ for (s = s0; s < upper; s++) {
+ switch (*s) {
+ case 'S':
+ case 's':
+ if (s_end - s > 1
+ && ((*(s+1) & S_or_s_mask) == s_masked))
+ {
+ *min_subtract += 1;
+
+ /* EXACTF nodes need to know that the minimum
+ * length changed so that a sharp s in the string
+ * can match this ss in the pattern, but they
+ * remain EXACTF nodes, as they are not trie'able,
+ * so don't have to invent a new node type to
+ * exclude them from the trie code */
+ if (OP(scan) != EXACTF) {
+ OP(scan) = EXACTFU_SS;
+ }
+ s++;
+ }
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ if (OP(scan) == EXACTF) {
+ *has_exactf_sharp_s = TRUE;
+ }
+ break;
+ }
+ }
+ }
}
-
+
#ifdef DEBUGGING
/* Allow dumping but overwriting the collection of skipped
* ops and/or strings with fake optimized ops */
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
+ UV min_subtract = 0; /* How much to subtract from the minimum node
+ length to get a real minimum (because the
+ folded version may be shorter) */
+ bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
DEBUG_PEEP("Peep",scan,depth);
- JOIN_EXACT(scan,&min,0);
+
+ /* Its not clear to khw or hv why this is done here, and not in the
+ * clauses that deal with EXACT nodes. khw's guess is that it's
+ * because of a previous design */
+ JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
int noff;
regnode *n = scan;
-
+
/* Skip NOTHING and LONGJMP. */
while ((n = regnext(n))
&& ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
next = regnext(scan);
code = OP(scan);
/* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
-
+
if (OP(next) == code || code == IFTHEN) {
/* NOTE - There is similar code to this block below for handling
TRIE nodes on a re-study. If you change stuff here check there
I32 max1 = 0, min1 = I32_MAX, num = 0;
struct regnode_charclass_class accum;
regnode * const startbranch=scan;
-
+
if (flags & SCF_DO_SUBSTR)
SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
a nested if into a case structure of sorts.
*/
-
+
int made=0;
if (!re_trie_maxbuff) {
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
#define TRIE_TYPE_IS_SAFE 1
+Note that join_exact() assumes that the other types of EXACTFish nodes are not
+used in tries, so that would have to be updated if this changed
+
*/
#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
if ( last && TRIE_TYPE_IS_SAFE ) {
made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
-#ifdef TRIE_STUDY_OPT
+#ifdef TRIE_STUDY_OPT
if ( ((made == MADE_EXACT_TRIE &&
startbranch == first)
|| ( first_non_open == first )) &&
l = utf8_length(s, s + l);
uc = utf8_to_uvchr(s, NULL);
}
- min += l;
- if (flags & SCF_DO_SUBSTR)
- data->pos_min += l;
+ else if (has_exactf_sharp_s) {
+ RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
+ }
+ min += l - min_subtract;
+ if (min < 0) {
+ min = 0;
+ }
+ delta += min_subtract;
+ if (flags & SCF_DO_SUBSTR) {
+ data->pos_min += l - min_subtract;
+ if (data->pos_min < 0) {
+ data->pos_min = 0;
+ }
+ data->pos_delta += min_subtract;
+ if (min_subtract) {
+ data->longest = &(data->longest_float);
+ }
+ }
if (flags & SCF_DO_STCLASS_AND) {
/* Check whether it is compatible with what we know already! */
int compat = 1;
/* Also set the other member of the fold pair. In case
* that unicode semantics is called for at runtime, use
* the full latin1 fold. (Can't do this for locale,
- * because not known until runtime */
+ * because not known until runtime) */
ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+
+ /* All other (EXACTFL handled above) folds except under
+ * /iaa that include s, S, and sharp_s also may include
+ * the others */
+ if (OP(scan) != EXACTFA) {
+ if (uc == 's' || uc == 'S') {
+ ANYOF_BITMAP_SET(data->start_class,
+ LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_BITMAP_SET(data->start_class, 's');
+ ANYOF_BITMAP_SET(data->start_class, 'S');
+ }
+ }
}
}
else if (uc >= 0x100) {
* run-time */
ANYOF_BITMAP_SET(data->start_class,
PL_fold_latin1[uc]);
+
+ /* All folds except under /iaa that include s, S,
+ * and sharp_s also may include the others */
+ if (OP(scan) != EXACTFA) {
+ if (uc == 's' || uc == 'S') {
+ ANYOF_BITMAP_SET(data->start_class,
+ LATIN_SMALL_LETTER_SHARP_S);
+ }
+ else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_BITMAP_SET(data->start_class, 's');
+ ANYOF_BITMAP_SET(data->start_class, 'S');
+ }
+ }
}
}
data->start_class->flags &= ~ANYOF_EOS;
data->longest = &(data->longest_float);
}
}
- else if (OP(scan) == FOLDCHAR) {
- int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
- flags &= ~SCF_DO_STCLASS;
- min += 1;
- delta += d;
- if (flags & SCF_DO_SUBSTR) {
- SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
- data->pos_min += 1;
- data->pos_delta += d;
- data->longest = &(data->longest_float);
- }
- }
else if (REGNODE_SIMPLE(OP(scan))) {
int value = 0;
break;
CASE_SYNST_FNC(VERTWS);
CASE_SYNST_FNC(HORIZWS);
-
+
}
if (flags & SCF_DO_STCLASS_OR)
cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_SUBSTR;
}
#endif /* old or new */
-#endif /* TRIE_STUDY_OPT */
+#endif /* TRIE_STUDY_OPT */
/* Else: zero-length, ignore. */
scan = regnext(scan);
DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifndef PERL_IN_XSUB_RE
+ /* Initialize these here instead of as-needed, as is quick and avoids
+ * having to test them each time otherwise */
+ if (! PL_AboveLatin1) {
+ PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
+ PL_ASCII = _new_invlist_C_array(ASCII_invlist);
+ PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
+
+ PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+ PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+ PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
+ PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
+
+ PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
+ PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
+
+ PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
+
+ PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
+ PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
+
+ PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
+
+ PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
+ PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
+
+ PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+ PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+ PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
+ PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
+
+ PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
+ PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
+
+ PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
+ PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
+
+ PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
+ PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
+
+ PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
+ PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
+
+ PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
+ PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
+
+ PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
+
+ PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
+ PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
+
+ PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
+ PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
+ }
+#endif
+
exp = SvPV(pattern, plen);
if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
sawplus = 1;
else
first += regarglen[OP(first)];
-
+
first = NEXTOPER(first);
first_next= regnext(first);
}
else
ri->regstclass = first;
}
-#ifdef TRIE_STCLASS
+#ifdef TRIE_STCLASS
else if (PL_regkind[OP(first)] == TRIE &&
((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
{
make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
ri->regstclass = trie_op;
}
-#endif
+#endif
else if (REGNODE_SIMPLE(OP(first)))
ri->regstclass = first;
else if (PL_regkind[OP(first)] == BOUND ||
* it happens that c_offset_min has been invalidated, since the
* earlier string may buy us something the later one won't.]
*/
-
+
data.longest_fixed = newSVpvs("");
data.longest_float = newSVpvs("");
data.last_found = newSVpvs("");
&data, -1, NULL, NULL,
SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
-
+
CHECK_RESTUDY_GOTO;
{
I32 t,ml;
- if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
+ if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+ || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ && data.offset_fixed == data.offset_float_min
+ && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
goto remove_float; /* As in (a)+. */
/* copy the information about the longest float from the reg_scan_data
Be careful.
*/
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
- if (longest_fixed_length
- || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE))))
+
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
+ if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
+ && (longest_fixed_length
+ || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
+ && (!(data.flags & SF_FIX_BEFORE_MEOL)
+ || (RExC_flags & RXf_PMf_MULTILINE)))) )
{
I32 t,ml;
I32 fake;
struct regnode_charclass_class ch_class;
I32 last_close = 0;
-
+
DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
scan = ri->program + 1;
return sv_dat;
}
else {
- Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
+ Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
+ (unsigned long) flags);
}
/* NOT REACHED */
}
#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
-#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */
+/* This is a combination of a version and data structure type, so that one
+ * being passed in can be validated to be an inversion list of the correct
+ * vintage. When the structure of the header is changed, a new random number
+ * in the range 2**31-1 should be generated and the new() method changed to
+ * insert that at this location. Then, if an auxiliary program doesn't change
+ * correspondingly, it will be discovered immediately */
+#define INVLIST_VERSION_ID_OFFSET 2
+#define INVLIST_VERSION_ID 1064334010
+
+/* For safety, when adding new elements, remember to #undef them at the end of
+ * the inversion list code section */
+
+#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
* contains the code point U+00000, and begins here. If 1, the inversion list
* doesn't contain U+0000, and it begins at the next UV in the array.
* properly */
*get_invlist_zero_addr(new_list) = UV_MAX;
+ *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
+#if HEADER_LENGTH != 4
+# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
+#endif
+
return new_list;
}
#endif
+STATIC SV*
+S__new_invlist_C_array(pTHX_ UV* list)
+{
+ /* Return a pointer to a newly constructed inversion list, initialized to
+ * point to <list>, which has to be in the exact correct inversion list
+ * form, including internal fields. Thus this is a dangerous routine that
+ * should not be used in the wrong hands */
+
+ SV* invlist = newSV_type(SVt_PV);
+
+ PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
+
+ SvPV_set(invlist, (char *) list);
+ SvLEN_set(invlist, 0); /* Means we own the contents, and the system
+ shouldn't touch it */
+ SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+
+ if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
+ Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+ }
+
+ return invlist;
+}
+
STATIC void
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
{
#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
+#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
+
#ifndef PERL_IN_XSUB_RE
void
Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
if (array[final_element] > start
|| ELEMENT_RANGE_MATCHES_INVLIST(final_element))
{
- Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list");
+ Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
+ array[final_element], start,
+ ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
}
/* Here, it is a legal append. If the new range begins with the first
return;
}
+
void
-Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
{
/* Take the union of two inversion lists and point <output> to it. *output
* should be defined upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented.
+ * the reference count to that list will be decremented. The first list,
+ * <a>, may be NULL, in which case a copy of the second list is returned.
+ * If <complement_b> is TRUE, the union is taken of the complement
+ * (inversion) of <b> instead of b itself.
+ *
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
*/
UV count = 0;
- PERL_ARGS_ASSERT__INVLIST_UNION;
+ PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
/* If either one is empty, the union is the other one */
- len_a = invlist_len(a);
- if (len_a == 0) {
+ if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
if (*output == a) {
- SvREFCNT_dec(a);
+ if (a != NULL) {
+ SvREFCNT_dec(a);
+ }
}
if (*output != b) {
*output = invlist_clone(b);
+ if (complement_b) {
+ _invlist_invert(*output);
+ }
} /* else *output already = b; */
return;
}
if (*output == b) {
SvREFCNT_dec(b);
}
- if (*output != a) {
- *output = invlist_clone(a);
- }
- /* else *output already = a; */
+
+ /* The complement of an empty list is a list that has everything in it,
+ * so the union with <a> includes everything too */
+ if (complement_b) {
+ if (a == *output) {
+ SvREFCNT_dec(a);
+ }
+ *output = _new_invlist(1);
+ _append_range_to_invlist(*output, 0, UV_MAX);
+ }
+ else if (*output != a) {
+ *output = invlist_clone(a);
+ }
+ /* else *output already = a; */
return;
}
array_a = invlist_array(a);
array_b = invlist_array(b);
+ /* If are to take the union of 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* To complement, we invert: if the first element is 0, remove it. To
+ * do this, we just pretend the array starts one later, and clear the
+ * flag as we don't have to do anything else later */
+ if (array_b[0] == 0) {
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
/* Size the union for the worst case: that the sets are completely
* disjoint */
u = _new_invlist(len_a + len_b);
SvREFCNT_dec(*output);
}
+ /* If we've changed b, restore it */
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+
*output = u;
return;
}
void
-Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
{
/* Take the intersection of two inversion lists and point <i> to it. *i
* should be defined upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented.
+ * If <complement_b> is TRUE, the result will be the intersection of <a>
+ * and the complement (or inversion) of <b> instead of <b> directly.
+ *
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
*/
UV count = 0;
- PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
+ PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- /* If either one is empty, the intersection is null */
+ /* Special case if either one is empty */
len_a = invlist_len(a);
if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
- /* If the result is the same as one of the inputs, the input is being
- * overwritten */
+ if (len_a != 0 && complement_b) {
+
+ /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
+ * be empty. Here, also we are using 'b's complement, which hence
+ * must be every possible code point. Thus the intersection is
+ * simply 'a'. */
+ if (*i != a) {
+ *i = invlist_clone(a);
+
+ if (*i == b) {
+ SvREFCNT_dec(b);
+ }
+ }
+ /* else *i is already 'a' */
+ return;
+ }
+
+ /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
+ * intersection must be empty */
if (*i == a) {
SvREFCNT_dec(a);
}
else if (*i == b) {
SvREFCNT_dec(b);
}
-
*i = _new_invlist(0);
return;
}
array_a = invlist_array(a);
array_b = invlist_array(b);
+ /* If are to take the intersection of 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* To complement, we invert: if the first element is 0, remove it. To
+ * do this, we just pretend the array starts one later, and clear the
+ * flag as we don't have to do anything else later */
+ if (array_b[0] == 0) {
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
/* Size the intersection for the worst case: that the intersection ends up
* fragmenting everything to be completely disjoint */
r= _new_invlist(len_a + len_b);
SvREFCNT_dec(*i);
}
+ /* If we've changed b, restore it */
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+
*i = r;
return;
}
return new_invlist;
}
-#ifndef PERL_IN_XSUB_RE
-void
-Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
-{
- /* Point <result> to an inversion list which consists of all elements in
- * <a> that aren't also in <b>. *result should be defined upon input, and
- * if it points to C<b> its reference count will be decremented. */
-
- PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
- assert(a != b);
-
- /* Subtracting nothing retains the original */
- if (invlist_len(b) == 0) {
-
- if (*result == b) {
- SvREFCNT_dec(b);
- }
-
- /* If the result is not to be the same variable as the original, create
- * a copy */
- if (*result != a) {
- *result = invlist_clone(a);
- }
- } else {
- SV *b_copy = invlist_clone(b);
- _invlist_invert(b_copy); /* Everything not in 'b' */
-
- if (*result == b) {
- SvREFCNT_dec(b);
- }
-
- _invlist_intersection(a, b_copy, result); /* Everything in 'a' not in
- 'b' */
- SvREFCNT_dec(b_copy);
- }
-
- return;
-}
-#endif
-
PERL_STATIC_INLINE UV*
S_get_invlist_iter_addr(pTHX_ SV* invlist)
{
return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
}
+PERL_STATIC_INLINE UV*
+S_get_invlist_version_id_addr(pTHX_ SV* invlist)
+{
+ /* Return the address of the UV that contains the version id. */
+
+ PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
+
+ return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+}
+
PERL_STATIC_INLINE void
S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
{
#undef INVLIST_LEN_OFFSET
#undef INVLIST_ZERO_OFFSET
#undef INVLIST_ITER_OFFSET
+#undef INVLIST_VERSION_ID
/* End of inversion list object */
RExC_parse++;
if (*RExC_parse!=')')
vFAIL("Expecting close bracket");
-
+
gen_recurse_regop:
if ( paren == '-' ) {
/*
RExC_parse++;
}
if (*RExC_parse != ')') {
- RExC_parse = s;
+ RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
|| RExC_parse[1] == '<'
|| RExC_parse[1] == '{') { /* Lookahead or eval. */
I32 flag;
-
+
ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 1;
Set_Node_Length(ret, 1);
}
}
-
+
if (!first && SIZE_ONLY)
RExC_extralen += 1; /* BRANCHJ */
break;
case 'p':
case 'P':
- {
+ {
char* const oldregxend = RExC_end;
#ifdef DEBUGGING
char* parse_start = RExC_parse - 2;
RExC_parse++;
defchar: {
- typedef enum {
- generic_char = 0,
- char_s,
- upsilon_1,
- upsilon_2,
- iota_1,
- iota_2,
- } char_state;
- char_state latest_char_state = generic_char;
register STRLEN len;
register UV ender;
register char *p;
char *s;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- regnode * orig_emit;
+ U8 node_type;
+
+ /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
+ * it is folded to 'ss' even if not utf8 */
+ bool is_exactfu_sharp_s;
ender = 0;
- orig_emit = RExC_emit; /* Save the original output node position in
- case we need to output a different node
- type */
- ret = reg_node(pRExC_state,
- (U8) ((! FOLD) ? EXACT
- : (LOC)
- ? EXACTFL
- : (MORE_ASCII_RESTRICTED)
- ? EXACTFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? EXACTFU
- : EXACTF)
- );
+ node_type = ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : (MORE_ASCII_RESTRICTED)
+ ? EXACTFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF);
+ ret = reg_node(pRExC_state, node_type);
s = STRING(ret);
+
+ /* XXX The node can hold up to 255 bytes, yet this only goes to
+ * 127. I (khw) do not know why. Keeping it somewhat less than
+ * 255 allows us to not have to worry about overflow due to
+ * converting to utf8 and fold expansion, but that value is
+ * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
+ * split up by this limit into a single one using the real max of
+ * 255. Even at 127, this breaks under rare circumstances. If
+ * folding, we do not want to split a node at a character that is a
+ * non-final in a multi-char fold, as an input string could just
+ * happen to want to match across the node boundary. The join
+ * would solve that problem if the join actually happens. But a
+ * series of more than two nodes in a row each of 127 would cause
+ * the first join to succeed to get to 254, but then there wouldn't
+ * be room for the next one, which could at be one of those split
+ * multi-char folds. I don't know of any fool-proof solution. One
+ * could back off to end with only a code point that isn't such a
+ * non-final, but it is possible for there not to be any in the
+ * entire node. */
for (len = 0, p = RExC_parse - 1;
- len < 127 && p < RExC_end;
- len++)
+ len < 127 && p < RExC_end;
+ len++)
{
char * const oldp = p;
case 'x':
if (*++p == '{') {
char* const e = strchr(p, '}');
-
+
if (!e) {
RExC_parse = p + 1;
vFAIL("Missing right brace on \\x{}");
break;
} /* End of switch on the literal */
- /* Certain characters are problematic because their folded
- * length is so different from their original length that it
- * isn't handleable by the optimizer. They are therefore not
- * placed in an EXACTish node; and are here handled specially.
- * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S,
- * putting it in a special node keeps regexec from having to
- * deal with a non-utf8 multi-char fold */
- if (FOLD
- && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
- {
- /* We look for either side of the fold. For example \xDF
- * folds to 'ss'. We look for both the single character
- * \xDF and the sequence 'ss'. When we find something that
- * could be one of those, we stop and flush whatever we
- * have output so far into the EXACTish node that was being
- * built. Then restore the input pointer to what it was.
- * regatom will return that EXACT node, and will be called
- * again, positioned so the first character is the one in
- * question, which we return in a different node type.
- * The multi-char folds are a sequence, so the occurrence
- * of the first character in that sequence doesn't
- * necessarily mean that what follows is the rest of the
- * sequence. We keep track of that with a state machine,
- * with the state being set to the latest character
- * processed before the current one. Most characters will
- * set the state to 0, but if one occurs that is part of a
- * potential tricky fold sequence, the state is set to that
- * character, and the next loop iteration sees if the state
- * should progress towards the final folded-from character,
- * or if it was a false alarm. If it turns out to be a
- * false alarm, the character(s) will be output in a new
- * EXACTish node, and join_exact() will later combine them.
- * In the case of the 'ss' sequence, which is more common
- * and more easily checked, some look-ahead is done to
- * save time by ruling-out some false alarms */
- switch (ender) {
- default:
- latest_char_state = generic_char;
- break;
- case 's':
- case 'S':
- case 0x17F: /* LATIN SMALL LETTER LONG S */
- if (AT_LEAST_UNI_SEMANTICS) {
- if (latest_char_state == char_s) { /* 'ss' */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- else if (p < RExC_end) {
-
- /* Look-ahead at the next character. If it
- * is also an s, we handle as a sharp s
- * tricky regnode. */
- if (*p == 's' || *p == 'S') {
-
- /* But first flush anything in the
- * EXACTish buffer */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- p++; /* Account for swallowing this
- 's' up */
- ender = LATIN_SMALL_LETTER_SHARP_S;
- goto do_tricky;
- }
- /* Here, the next character is not a
- * literal 's', but still could
- * evaluate to one if part of a \o{},
- * \x or \OCTAL-DIGIT. The minimum
- * length required for that is 4, eg
- * \x53 or \123 */
- else if (*p == '\\'
- && p < RExC_end - 4
- && (isDIGIT(*(p + 1))
- || *(p + 1) == 'x'
- || *(p + 1) == 'o' ))
- {
-
- /* Here, it could be an 's', too much
- * bother to figure it out here. Flush
- * the buffer if any; when come back
- * here, set the state so know that the
- * previous char was an 's' */
- if (len != 0) {
- latest_char_state = generic_char;
- p = oldp;
- goto loopdone;
- }
- latest_char_state = char_s;
- break;
- }
- }
- }
-
- /* Here, can't be an 'ss' sequence, or at least not
- * one that could fold to/from the sharp ss */
- latest_char_state = generic_char;
- break;
- case 0x03C5: /* First char in upsilon series */
- case 0x03A5: /* Also capital UPSILON, which folds to
- 03C5, and hence exhibits the same
- problem */
- if (p < RExC_end - 4) { /* Need >= 4 bytes left */
- latest_char_state = upsilon_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x03B9: /* First char in iota series */
- case 0x0399: /* Also capital IOTA */
- case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
- case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
- to 3B9 */
- if (p < RExC_end - 4) {
- latest_char_state = iota_1;
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x0308:
- if (latest_char_state == upsilon_1) {
- latest_char_state = upsilon_2;
- }
- else if (latest_char_state == iota_1) {
- latest_char_state = iota_2;
- }
- else {
- latest_char_state = generic_char;
- }
- break;
- case 0x301:
- if (latest_char_state == upsilon_2) {
- ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- else if (latest_char_state == iota_2) {
- ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
- goto do_tricky;
- }
- latest_char_state = generic_char;
- break;
-
- /* These are the tricky fold characters. Flush any
- * buffer first. (When adding to this list, also should
- * add them to fold_grind.t to make sure get tested) */
- case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
- case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
- case LATIN_SMALL_LETTER_SHARP_S:
- case LATIN_CAPITAL_LETTER_SHARP_S:
- case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
- case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
- if (len != 0) {
- p = oldp;
- goto loopdone;
- }
- /* FALL THROUGH */
- do_tricky: {
- char* const oldregxend = RExC_end;
- U8 tmpbuf[UTF8_MAXBYTES+1];
-
- /* Here, we know we need to generate a special
- * regnode, and 'ender' contains the tricky
- * character. What's done is to pretend it's in a
- * [bracketed] class, and let the code that deals
- * with those handle it, as that code has all the
- * intelligence necessary. First save the current
- * parse state, get rid of the already allocated
- * but empty EXACT node that the ANYOFV node will
- * replace, and point the parse to a buffer which
- * we fill with the character we want the regclass
- * code to think is being parsed */
- RExC_emit = orig_emit;
- RExC_parse = (char *) tmpbuf;
- if (UTF) {
- U8 *d = uvchr_to_utf8(tmpbuf, ender);
- *d = '\0';
- RExC_end = (char *) d;
- }
- else { /* ender above 255 already excluded */
- tmpbuf[0] = (U8) ender;
- tmpbuf[1] = '\0';
- RExC_end = RExC_parse + 1;
- }
-
- ret = regclass(pRExC_state,depth+1);
-
- /* Here, have parsed the buffer. Reset the parse to
- * the actual input, and return */
- RExC_end = oldregxend;
- RExC_parse = p - 1;
-
- Set_Node_Offset(ret, RExC_parse);
- Set_Node_Cur_Length(ret);
- nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
- return ret;
- }
- }
- }
-
+ is_exactfu_sharp_s = (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S);
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
- if (UTF && FOLD) {
+ if ((UTF && FOLD) || is_exactfu_sharp_s) {
/* Prime the casefolded buffer. Locale rules, which apply
* only to code points < 256, aren't known until execution,
* so for them, just output the original character using
- * utf8 */
+ * utf8. If we start to fold non-UTF patterns, be sure to
+ * update join_exact() */
if (LOC && ender < 256) {
if (UNI_IS_INVARIANT(ender)) {
*tmpbuf = (U8) ender;
if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- else if (UTF) {
+ else if (UTF || is_exactfu_sharp_s) {
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
}
break;
}
- if (UTF) {
+ if (UTF || is_exactfu_sharp_s) {
if (FOLD) {
/* Emit all the Unicode characters. */
STRLEN numlen;
*flagp |= HASWIDTH;
if (len == 1 && UNI_IS_INVARIANT(ender))
*flagp |= SIMPLE;
-
+
if (SIZE_ONLY)
RExC_size += STR_SZ(len);
else {
POSIXCC(UCHARAT(RExC_parse))) {
const char c = UCHARAT(RExC_parse);
char* const s = RExC_parse++;
-
+
while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
RExC_parse++;
if (RExC_parse == RExC_end)
}
}
-/* No locale test, and always Unicode semantics, no ignore-case differences */
-#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
-ANYOF_##NAME: \
- for (value = 0; value < 256; value++) \
- if (TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- yesno = '+'; \
- what = WORD; \
- break; \
-case ANYOF_N##NAME: \
- for (value = 0; value < 256; value++) \
- if (!TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- yesno = '!'; \
- what = WORD; \
- break
-
-/* Like the above, but there are differences if we are in uni-8-bit or not, so
- * there are two tests passed in, to use depending on that. There aren't any
- * cases where the label is different from the name, so no need for that
- * parameter.
- * Sets 'what' to WORD which is the property name for non-bitmap code points;
- * But, uses FOLD_WORD instead if /i has been selected, to allow a different
- * property name */
-#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \
-ANYOF_##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- else { \
- for (value = 0; value < 128; value++) { \
- if (TEST_7(UNI_TO_NATIVE(value))) stored += \
- set_regclass_bit(pRExC_state, ret, \
- (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- yesno = '+'; \
- if (FOLD) { \
- what = FOLD_WORD; \
- } \
- else { \
- what = WORD; \
- } \
- break; \
-case ANYOF_N##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (! TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- else { \
- for (value = 0; value < 128; value++) { \
- if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- if (AT_LEAST_ASCII_RESTRICTED) { \
- for (value = 128; value < 256; value++) { \
- stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
- } \
- else { \
- /* For a non-ut8 target string with DEPENDS semantics, all above \
- * ASCII Latin1 code points match the complement of any of the \
- * classes. But in utf8, they have their Unicode semantics, so \
- * can't just set them in the bitmap, or else regexec.c will think \
- * they matched when they shouldn't. */ \
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
- } \
- } \
- yesno = '!'; \
- if (FOLD) { \
- what = FOLD_WORD; \
- } \
- else { \
- what = WORD; \
- } \
- break
+/* Generate the code to add a full posix character <class> to the bracketed
+ * character class given by <node>. (<node> is needed only under locale rules)
+ * destlist is the inversion list for non-locale rules that this class is
+ * to be added to
+ * sourcelist is the ASCII-range inversion list to add under /a rules
+ * Xsourcelist is the full Unicode range list to use otherwise. */
+#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
+ if (LOC) { \
+ SV* scratch_list = NULL; \
+ \
+ /* Set this class in the node for runtime matching */ \
+ ANYOF_CLASS_SET(node, class); \
+ \
+ /* For above Latin1 code points, we use the full Unicode range */ \
+ _invlist_intersection(PL_AboveLatin1, \
+ Xsourcelist, \
+ &scratch_list); \
+ /* And set the output to it, adding instead if there already is an \
+ * output. Checking if <destlist> is NULL first saves an extra \
+ * clone. Its reference count will be decremented at the next \
+ * union, etc, or if this is the only instance, at the end of the \
+ * routine */ \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ } \
+ else { \
+ /* For non-locale, just add it to any existing list */ \
+ _invlist_union(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : Xsourcelist, \
+ &destlist); \
+ }
+
+/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
+ */
+#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
+ if (LOC) { \
+ SV* scratch_list = NULL; \
+ ANYOF_CLASS_SET(node, class); \
+ _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ } \
+ else { \
+ _invlist_union_complement_2nd(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : Xsourcelist, \
+ &destlist); \
+ /* Under /d, everything in the upper half of the Latin1 range \
+ * matches this complement */ \
+ if (DEPENDS_SEMANTICS) { \
+ ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
+ } \
+ }
+
+/* Generate the code to add a posix character <class> to the bracketed
+ * character class given by <node>. (<node> is needed only under locale rules)
+ * destlist is the inversion list for non-locale rules that this class is
+ * to be added to
+ * sourcelist is the ASCII-range inversion list to add under /a rules
+ * l1_sourcelist is the Latin1 range list to use otherwise.
+ * Xpropertyname is the name to add to <run_time_list> of the property to
+ * specify the code points above Latin1 that will have to be
+ * determined at run-time
+ * run_time_list is a SV* that contains text names of properties that are to
+ * be computed at run time. This concatenates <Xpropertyname>
+ * to it, apppropriately
+ * This is essentially DO_POSIX, but we know only the Latin1 values at compile
+ * time */
+#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
+ l1_sourcelist, Xpropertyname, run_time_list) \
+ /* If not /a matching, there are going to be code points we will have \
+ * to defer to runtime to look-up */ \
+ if (! AT_LEAST_ASCII_RESTRICTED) { \
+ Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
+ } \
+ if (LOC) { \
+ ANYOF_CLASS_SET(node, class); \
+ } \
+ else { \
+ _invlist_union(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : l1_sourcelist, \
+ &destlist); \
+ }
+
+/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
+ * this and DO_N_POSIX */
+#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
+ l1_sourcelist, Xpropertyname, run_time_list) \
+ if (AT_LEAST_ASCII_RESTRICTED) { \
+ _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
+ } \
+ else { \
+ Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+ if (LOC) { \
+ ANYOF_CLASS_SET(node, namedclass); \
+ } \
+ else { \
+ SV* scratch_list = NULL; \
+ _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ if (DEPENDS_SEMANTICS) { \
+ ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
+ } \
+ } \
+ }
STATIC U8
S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
SV *listsv = NULL;
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
+ SV* properties = NULL; /* Code points that match \p{} \P{} */
+ UV element_count = 0; /* Number of distinct elements in the class.
+ Optimizations may be possible if this is tiny */
UV n;
+ /* Unicode properties are stored in a swash; this holds the current one
+ * being parsed. If this swash is the only above-latin1 component of the
+ * character class, an optimization is to pass it directly on to the
+ * execution engine. Otherwise, it is set to NULL to indicate that there
+ * are other things in the class that have to be dealt with at execution
+ * time */
+ SV* swash = NULL; /* Code points that match \p{} \P{} */
+
+ /* Set if a component of this character class is user-defined; just passed
+ * on to the engine */
+ UV has_user_defined_property = 0;
+
/* code points this node matches that can't be stored in the bitmap */
SV* nonbitmap = NULL;
namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
- if (!range)
+ if (!range) {
rangebegin = RExC_parse;
+ element_count++;
+ }
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
n = 1;
}
if (!SIZE_ONLY) {
+ SV** invlistsvp;
+ SV* invlist;
+ char* name;
if (UCHARAT(RExC_parse) == '^') {
RExC_parse++;
n--;
n--;
}
}
+ /* Try to get the definition of the property into
+ * <invlist>. If /i is in effect, the effective property
+ * will have its name be <__NAME_i>. The design is
+ * discussed in commit
+ * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
+ Newx(name, n + sizeof("_i__\n"), char);
+
+ sprintf(name, "%s%.*s%s\n",
+ (FOLD) ? "__" : "",
+ (int)n,
+ RExC_parse,
+ (FOLD) ? "_i" : ""
+ );
+
+ /* Look up the property name, and get its swash and
+ * inversion list, if the property is found */
+ if (swash) {
+ SvREFCNT_dec(swash);
+ }
+ swash = _core_swash_init("utf8", name, &PL_sv_undef,
+ 1, /* binary */
+ 0, /* not tr/// */
+ TRUE, /* this routine will handle
+ undefined properties */
+ NULL, FALSE /* No inversion list */
+ );
+ if ( ! swash
+ || ! SvROK(swash)
+ || ! SvTYPE(SvRV(swash)) == SVt_PVHV
+ || ! (invlistsvp =
+ hv_fetchs(MUTABLE_HV(SvRV(swash)),
+ "INVLIST", FALSE))
+ || ! (invlist = *invlistsvp))
+ {
+ if (swash) {
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
+
+ /* Here didn't find it. It could be a user-defined
+ * property that will be available at run-time. Add it
+ * to the list to look up then */
+ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
+ (value == 'p' ? '+' : '!'),
+ name);
+ has_user_defined_property = 1;
+
+ /* We don't know yet, so have to assume that the
+ * property could match something in the Latin1 range,
+ * hence something that isn't utf8 */
+ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
+ }
+ else {
+
+ /* Here, did get the swash and its inversion list. If
+ * the swash is from a user-defined property, then this
+ * whole character class should be regarded as such */
+ SV** user_defined_svp =
+ hv_fetchs(MUTABLE_HV(SvRV(swash)),
+ "USER_DEFINED", FALSE);
+ if (user_defined_svp) {
+ has_user_defined_property
+ |= SvUV(*user_defined_svp);
+ }
+
+ /* Invert if asking for the complement */
+ if (value == 'P') {
+ _invlist_union_complement_2nd(properties, invlist, &properties);
- /* Add the property name to the list. If /i matching, give
- * a different name which consists of the normal name
- * sandwiched between two underscores and '_i'. The design
- * is discussed in the commit message for this. */
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
- (value=='p' ? '+' : '!'),
- (FOLD) ? "__" : "",
- (int)n,
- RExC_parse,
- (FOLD) ? "_i" : ""
- );
+ /* The swash can't be used as-is, because we've
+ * inverted things; delay removing it to here after
+ * have copied its invlist above */
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
+ else {
+ _invlist_union(properties, invlist, &properties);
+ }
+ }
+ Safefree(name);
}
RExC_parse = e + 1;
-
- /* The \p could match something in the Latin1 range, hence
- * something that isn't utf8 */
- ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
namedclass = ANYOF_MAX; /* no official name, but it's named */
/* \p means they want Unicode semantics */
range = 0; /* this was not a true range */
}
-
-
if (!SIZE_ONLY) {
- const char *what = NULL;
- char yesno = 0;
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
-
- case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
- case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
- case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
- case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
- case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
- case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
- case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
- case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
- case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
- case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
- /* \s, \w match all unicode if utf8. */
- case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
- case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
- case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
- case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
- case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
+
+ case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_NALNUMC:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_ALPHA:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
+ case ANYOF_NALPHA:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
case ANYOF_ASCII:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ASCII);
- else {
- for (value = 0; value < 128; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
}
- yesno = '+';
- what = NULL; /* Doesn't match outside ascii, so
- don't want to add +utf8:: */
+ else {
+ _invlist_union(properties, PL_ASCII, &properties);
+ }
break;
case ANYOF_NASCII:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NASCII);
- else {
- for (value = 128; value < 256; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
}
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
- yesno = '!';
- what = "ASCII";
- break;
+ else {
+ _invlist_union_complement_2nd(properties,
+ PL_ASCII, &properties);
+ if (DEPENDS_SEMANTICS) {
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+ }
+ }
+ break;
+ case ANYOF_BLANK:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_NBLANK:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_CNTRL:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
+ case ANYOF_NCNTRL:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_NDIGIT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_GRAPH:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_NGRAPH:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_HORIZWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding. It turns out that \h
+ * is just a synonym for XPosixBlank */
+ _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_NHORIZWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_LOWER:
+ case ANYOF_NLOWER:
+ { /* These require special handling, as they differ under
+ folding, matching Cased there (which in the ASCII range
+ is the same as Alpha */
+
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
else {
- /* consecutive digits assumed */
- for (value = '0'; value <= '9'; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+ ascii_source = PL_PosixLower;
+ l1_source = PL_L1PosixLower;
+ Xname = "XPosixLower";
+ }
+ if (namedclass == ANYOF_LOWER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
+ }
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
}
- yesno = '+';
- what = "Digit";
break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ }
+ case ANYOF_PRINT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_NPRINT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_PUNCT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_NPUNCT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_PSXSPC:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_NPSXSPC:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_SPACE:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_NSPACE:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_UPPER: /* Same as LOWER, above */
+ case ANYOF_NUPPER:
+ {
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
else {
- /* consecutive digits assumed */
- for (value = 0; value < '0'; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
- for (value = '9' + 1; value < 256; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+ ascii_source = PL_PosixUpper;
+ l1_source = PL_L1PosixUpper;
+ Xname = "XPosixUpper";
}
- yesno = '!';
- what = "Digit";
- if (AT_LEAST_ASCII_RESTRICTED ) {
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ if (namedclass == ANYOF_UPPER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
}
- break;
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
+ }
+ break;
+ }
+ case ANYOF_ALNUM: /* Really is 'Word' */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_NALNUM:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_VERTWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding */
+ _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_NVERTWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_XDIGIT:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
+ case ANYOF_NXDIGIT:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
case ANYOF_MAX:
/* this is to handle \p and \P */
break;
vFAIL("Invalid [::] class");
break;
}
- if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
- /* Strings such as "+utf8::isWord\n" */
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
- }
continue;
}
if (! PL_utf8_foldable) {
SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
PL_utf8_foldable = _swash_to_invlist(swash);
+ SvREFCNT_dec(swash);
}
/* This is a hash that for a particular fold gives all characters
}
}
+ /* And combine the result (if any) with any inversion list from properties.
+ * The lists are kept separate up to now because we don't want to fold the
+ * properties */
+ if (properties) {
+ if (nonbitmap) {
+ _invlist_union(nonbitmap, properties, &nonbitmap);
+ SvREFCNT_dec(properties);
+ }
+ else {
+ nonbitmap = properties;
+ }
+ }
+
+ /* Here, <nonbitmap> contains all the code points we can determine at
+ * compile time that we haven't put into the bitmap. Go through it, and
+ * for things that belong in the bitmap, put them there, and delete from
+ * <nonbitmap> */
+ if (nonbitmap) {
+
+ /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
+ * possibly only should match when the target string is UTF-8 */
+ UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
+
+ /* This gets set if we actually need to modify things */
+ bool change_invlist = FALSE;
+
+ UV start, end;
+
+ /* Start looking through <nonbitmap> */
+ invlist_iterinit(nonbitmap);
+ while (invlist_iternext(nonbitmap, &start, &end)) {
+ UV high;
+ int i;
+
+ /* Quit if are above what we should change */
+ if (start > max_cp_to_set) {
+ break;
+ }
+
+ change_invlist = TRUE;
+
+ /* Set all the bits in the range, up to the max that we are doing */
+ high = (end < max_cp_to_set) ? end : max_cp_to_set;
+ for (i = start; i <= (int) high; i++) {
+ if (! ANYOF_BITMAP_TEST(ret, i)) {
+ ANYOF_BITMAP_SET(ret, i);
+ stored++;
+ prevvalue = value;
+ value = i;
+ }
+ }
+ }
+
+ /* Done with loop; remove any code points that are in the bitmap from
+ * <nonbitmap> */
+ if (change_invlist) {
+ _invlist_subtract(nonbitmap,
+ (DEPENDS_SEMANTICS)
+ ? PL_ASCII
+ : PL_Latin1,
+ &nonbitmap);
+ }
+
+ /* If have completely emptied it, remove it completely */
+ if (invlist_len(nonbitmap) == 0) {
+ SvREFCNT_dec(nonbitmap);
+ nonbitmap = NULL;
+ }
+ }
+
/* Here, we have calculated what code points should be in the character
- * class.
+ * class. <nonbitmap> does not overlap the bitmap except possibly in the
+ * case of DEPENDS rules.
*
* Now we can see about various optimizations. Fold calculation (which we
* did above) needs to take place before inversion. Otherwise /[^k]/i
|| (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
&& SvCUR(listsv) == initial_listsv_len)
{
+ int i;
if (! nonbitmap) {
- for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
- ANYOF_BITMAP(ret)[value] ^= 0xFF;
+ for (i = 0; i < 256; ++i) {
+ if (ANYOF_BITMAP_TEST(ret, i)) {
+ ANYOF_BITMAP_CLEAR(ret, i);
+ }
+ else {
+ ANYOF_BITMAP_SET(ret, i);
+ prevvalue = value;
+ value = i;
+ }
+ }
/* The inversion means that everything above 255 is matched */
ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
}
else {
- /* Here, also has things outside the bitmap. Go through each bit
- * individually and add it to the list to get rid of from those
- * things not in the bitmap */
- SV *remove_list = _new_invlist(2);
+ /* Here, also has things outside the bitmap that may overlap with
+ * the bitmap. We have to sync them up, so that they get inverted
+ * in both places. Earlier, we removed all overlaps except in the
+ * case of /d rules, so no syncing is needed except for this case
+ */
+ SV *remove_list = NULL;
+
+ if (DEPENDS_SEMANTICS) {
+ UV start, end;
+
+ /* Set the bits that correspond to the ones that aren't in the
+ * bitmap. Otherwise, when we invert, we'll miss these.
+ * Earlier, we removed from the nonbitmap all code points
+ * < 128, so there is no extra work here */
+ invlist_iterinit(nonbitmap);
+ while (invlist_iternext(nonbitmap, &start, &end)) {
+ if (start > 255) { /* The bit map goes to 255 */
+ break;
+ }
+ if (end > 255) {
+ end = 255;
+ }
+ for (i = start; i <= (int) end; ++i) {
+ ANYOF_BITMAP_SET(ret, i);
+ prevvalue = value;
+ value = i;
+ }
+ }
+ }
+
+ /* Now invert both the bitmap and the nonbitmap. Anything in the
+ * bitmap has to also be removed from the non-bitmap, but again,
+ * there should not be overlap unless is /d rules. */
_invlist_invert(nonbitmap);
- for (value = 0; value < 256; ++value) {
- if (ANYOF_BITMAP_TEST(ret, value)) {
- ANYOF_BITMAP_CLEAR(ret, value);
- remove_list = add_cp_to_invlist(remove_list, value);
+
+ /* Any swash can't be used as-is, because we've inverted things */
+ if (swash) {
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
+
+ for (i = 0; i < 256; ++i) {
+ if (ANYOF_BITMAP_TEST(ret, i)) {
+ ANYOF_BITMAP_CLEAR(ret, i);
+ if (DEPENDS_SEMANTICS) {
+ if (! remove_list) {
+ remove_list = _new_invlist(2);
+ }
+ remove_list = add_cp_to_invlist(remove_list, i);
+ }
}
else {
- ANYOF_BITMAP_SET(ret, value);
+ ANYOF_BITMAP_SET(ret, i);
+ prevvalue = value;
+ value = i;
}
}
/* And do the removal */
- _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
- SvREFCNT_dec(remove_list);
+ if (DEPENDS_SEMANTICS) {
+ if (remove_list) {
+ _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
+ SvREFCNT_dec(remove_list);
+ }
+ }
+ else {
+ /* There is no overlap for non-/d, so just delete anything
+ * below 256 */
+ _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
+ }
}
stored = 256 - stored;
/* Folding in the bitmap is taken care of above, but not for locale (for
* which we have to wait to see what folding is in effect at runtime), and
- * for things not in the bitmap. Set run-time fold flag for these */
- if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
+ * for some things not in the bitmap (only the upper latin folds in this
+ * case, as all other single-char folding has been set above). Set
+ * run-time fold flag for these */
+ if (FOLD && (LOC
+ || (DEPENDS_SEMANTICS
+ && nonbitmap
+ && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
+ || unicode_alternate))
+ {
ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
}
* is just the lower case of the current one (which may resolve to
* itself, or to the other one */
value = toLOWER_LATIN1(value);
- if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
- /* To join adjacent nodes, they must be the exact EXACTish
- * type. Try to use the most likely type, by using EXACTFU if
- * the regex calls for them, or is required because the
- * character is non-ASCII */
+ /* To join adjacent nodes, they must be the exact EXACTish type.
+ * Try to use the most likely type, by using EXACTFA if possible,
+ * then EXACTFU if the regex calls for it, or is required because
+ * the character is non-ASCII. (If <value> is ASCII, its fold is
+ * also ASCII for the cases where we get here.) */
+ if (MORE_ASCII_RESTRICTED && isASCII(value)) {
+ op = EXACTFA;
+ }
+ else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
op = EXACTFU;
}
else { /* Otherwise, more likely to be EXACTF type */
return ret;
}
+ /* If there is a swash and more than one element, we can't use the swash in
+ * the optimization below. */
+ if (swash && element_count > 1) {
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
if (! nonbitmap
&& SvCUR(listsv) == initial_listsv_len
&& ! unicode_alternate)
SvREFCNT_dec(unicode_alternate);
}
else {
-
+ /* av[0] stores the character class description in its textual form:
+ * used later (regexec.c:Perl_regclass_swash()) to initialize the
+ * appropriate swash, and is also useful for dumping the regnode.
+ * av[1] if NULL, is a placeholder to later contain the swash computed
+ * from av[0]. But if no further computation need be done, the
+ * swash is stored there now.
+ * av[2] stores the multicharacter foldings, used later in
+ * regexec.c:S_reginclass().
+ * av[3] stores the nonbitmap inversion list for use in addition or
+ * instead of av[0]; not used if av[1] isn't NULL
+ * av[4] is set if any component of the class is from a user-defined
+ * property; not used if av[1] isn't NULL */
AV * const av = newAV();
SV *rv;
- /* The 0th element stores the character class description
- * in its textual form: used later (regexec.c:Perl_regclass_swash())
- * to initialize the appropriate swash (which gets stored in
- * element [1]), and also useful for dumping the regnode.
- * Element [2] stores the multicharacter foldings,
- * used later (regexec.c:S_reginclass()).
- * Element [3] stores the nonbitmap inversion list for use in addition
- * or instead of element [0].
- * Element [4] is currently FALSE */
+
av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
? &PL_sv_undef
: listsv);
- av_store(av, 1, NULL); /* Placeholder for generated swash */
+ if (swash) {
+ av_store(av, 1, swash);
+ SvREFCNT_dec(nonbitmap);
+ }
+ else {
+ av_store(av, 1, NULL);
if (nonbitmap) {
av_store(av, 3, nonbitmap);
- av_store(av, 4, newSVuv(0));
+ av_store(av, 4, newSVuv(has_user_defined_property));
}
+ }
/* Store any computed multi-char folds only if we are allowing
* them */
}
return ret;
}
-#undef _C_C_T_
/* reg_skipcomment()
PERL_ARGS_ASSERT_NEXTCHAR;
for (;;) {
- if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
- RExC_parse[2] == '#') {
+ if (RExC_end - RExC_parse >= 3
+ && *RExC_parse == '('
+ && RExC_parse[1] == '?'
+ && RExC_parse[2] == '#')
+ {
while (*RExC_parse != ')') {
if (RExC_parse == RExC_end)
FAIL("Sequence (?#... not terminated");
return(ret);
}
if (RExC_emit >= RExC_emit_bound)
- Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
+ Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
+ op, RExC_emit, RExC_emit_bound);
NODE_ALIGN_FILL(ret);
ptr = ret;
We can't do this:
assert(2==regarglen[op]+1);
-
+
Anything larger than this has to allocate the extra amount.
If we changed this to be:
return(ret);
}
if (RExC_emit >= RExC_emit_bound)
- Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op);
+ Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
+ op, RExC_emit, RExC_emit_bound);
NODE_ALIGN_FILL(ret);
ptr = ret;
for (;;) {
regnode * const temp = regnext(scan);
#ifdef EXPERIMENTAL_INPLACESCAN
- if (PL_regkind[OP(scan)] == EXACT)
- if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
+ if (PL_regkind[OP(scan)] == EXACT) {
+ bool has_exactf_sharp_s; /* Unexamined in this routine */
+ if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
return EXACT;
+ }
#endif
if ( exact ) {
switch (OP(scan)) {
case EXACTF:
case EXACTFA:
case EXACTFU:
+ case EXACTFU_SS:
+ case EXACTFU_NO_TRIE:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
} else if (k == LOGICAL)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
- else if (k == FOLDCHAR)
- Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) );
else if (k == ANYOF) {
int i, rangestart = -1;
const U8 flags = ANYOF_FLAGS(o);
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
-
+
/* output what the standard cp 0-255 bitmap matches */
for (i = 0; i <= 256; i++) {
if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
if (ANYOF_NONBITMAP(o)) {
SV *lv; /* Set if there is something outside the bit map */
SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
-
+ bool byte_output = FALSE; /* If something in the bitmap has been
+ output */
+
if (lv && lv != &PL_sv_undef) {
if (sw) {
U8 s[UTF8_MAXBYTES_CASE+1];
for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
uvchr_to_utf8(s, i);
-
- if (i < 256 && swash_fetch(sw, s, TRUE)) {
+
+ if (i < 256
+ && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
+ things already
+ output as part
+ of the bitmap */
+ && swash_fetch(sw, s, TRUE))
+ {
if (rangestart == -1)
rangestart = i;
} else if (rangestart != -1) {
+ byte_output = TRUE;
if (i <= rangestart + 3)
for (; rangestart < i; rangestart++) {
- const U8 * const e = uvchr_to_utf8(s,rangestart);
- U8 *p;
- for(p = s; p < e; p++)
- put_byte(sv, *p);
+ put_byte(sv, rangestart);
}
else {
- const U8 *e = uvchr_to_utf8(s,rangestart);
- U8 *p;
- for (p = s; p < e; p++)
- put_byte(sv, *p);
+ put_byte(sv, rangestart);
sv_catpvs(sv, "-");
- e = uvchr_to_utf8(s, i-1);
- for (p = s; p < e; p++)
- put_byte(sv, *p);
- }
- rangestart = -1;
+ put_byte(sv, i-1);
}
+ rangestart = -1;
}
-
- sv_catpvs(sv, "..."); /* et cetera */
+ }
}
{
char *s = savesvpv(lv);
char * const origs = s;
-
+
while (*s && *s != '\n')
s++;
-
+
if (*s == '\n') {
const char * const t = ++s;
-
+
+ if (byte_output) {
+ sv_catpvs(sv, " ");
+ }
+
while (*s) {
- if (*s == '\n')
+ if (*s == '\n') {
+
+ /* Truncate very long output */
+ if (s - origs > 256) {
+ Perl_sv_catpvf(aTHX_ sv,
+ "%.*s...",
+ (int) (s - origs - 1),
+ t);
+ goto out_dump;
+ }
*s = ' ';
+ }
+ else if (*s == '\t') {
+ *s = '-';
+ }
s++;
}
if (s[-1] == ' ')
s[-1] = 0;
-
+
sv_catpv(sv, t);
}
-
+
+ out_dump:
+
Safefree(origs);
}
SvREFCNT_dec(lv);
}
#endif
-STATIC void
+STATIC void
S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
{
va_list args;
goto after_print;
} else
CLEAR_OPTSTART;
-
+
regprop(r, sv, node);
PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
(int)(2*indent + 1), "", SvPVX_const(sv));
sv_setpvs(sv, "");
for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
-
+
PerlIO_printf(Perl_debug_log, "%*s%s ",
(int)(2*(indent+3)), "",
elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,