* The adjacent nodes actually may be separated by NOTHING-kind nodes, and
* these get optimized out
*
+ * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
+ * as possible, even if that means splitting an existing node so that its first
+ * part is moved to the preceding node. This would maximise the efficiency of
+ * memEQ during matching. Elsewhere in this file, khw proposes splitting
+ * EXACTFish nodes into portions that don't change under folding vs those that
+ * do. Those portions that don't change may be the only things in the pattern that
+ * could be used to find fixed and floating strings.
+ *
* If a node is to match under /i (folded), the number of characters it matches
* can be different than its character length if it contains a multi-character
* fold. *min_subtract is set to the total delta number of characters of the
}
#endif
-#ifdef PERL_ARGS_ASSERT__INVLISTEQ
+#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
bool
Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
{
reparse:
/* We look for the EXACTFish to EXACT node optimizaton only if
- * folding. (And we don't need to figure this out until pass 2) */
+ * folding. (And we don't need to figure this out until pass 2).
+ * XXX It might actually make sense to split the node into portions
+ * that are exact and ones that aren't, so that we could later use
+ * the exact ones to find the longest fixed and floating strings.
+ * One would want to join them back into a larger node. One could
+ * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
maybe_exact = FOLD && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* adjacent such nodes. And if the class is equivalent to things like /./,
* expensive run-time swashes can be avoided. Now that we have more
* complete information, we can find things necessarily missed by the
- * earlier code. */
+ * earlier code. Another possible "optimization" that isn't done is that
+ * something like [Ee] could be changed into an EXACTFU. khw tried this
+ * and found that the ANYOF is faster, including for code points not in the
+ * bitmap. This still might make sense to do, provided it got joined with
+ * an adjacent node(s) to create a longer EXACTFU one. This could be
+ * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
+ * routine would know is joinable. If that didn't happen, the node type
+ * could then be made a straight ANYOF */
if (optimizable && cp_list && ! invert) {
UV start, end;
invlist_iterfinish(cp_list);
if (op == END) {
+ const UV cp_list_len = _invlist_len(cp_list);
+ const UV* cp_list_array = invlist_array(cp_list);
/* Here, didn't find an optimization. See if this matches any of
* the POSIX classes. These run slightly faster for above-Unicode
* code points, so don't bother with POSIXA ones nor the 2 that
- * have no above-Unicode matches */
- for (posix_class = 0;
- posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
- posix_class++)
- {
- int try_inverted;
- if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
- continue;
- }
- for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+ * have no above-Unicode matches. We can avoid these checks unless
+ * the ANYOF matches at least as high as the lowest POSIX one
+ * (which was manually found to be \v). The actual code point may
+ * increase in later Unicode releases, if a higher code point is
+ * assigned to be \v, but this code will never break. It would
+ * just mean we could execute the checks for posix optimizations
+ * unnecessarily) */
+
+ if (cp_list_array[cp_list_len-1] > 0x2029) {
+ for (posix_class = 0;
+ posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+ posix_class++)
+ {
+ int try_inverted;
+ if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+ continue;
+ }
+ for (try_inverted = 0; try_inverted < 2; try_inverted++) {
- /* Check if matches normal or inverted */
- if (_invlistEQ(cp_list,
- PL_XPosix_ptrs[posix_class],
- try_inverted))
- {
- op = (try_inverted)
- ? NPOSIXU
- : POSIXU;
- *flagp |= HASWIDTH|SIMPLE;
- goto found_posix;
+ /* Check if matches normal or inverted */
+ if (_invlistEQ(cp_list,
+ PL_XPosix_ptrs[posix_class],
+ try_inverted))
+ {
+ op = (try_inverted)
+ ? NPOSIXU
+ : POSIXU;
+ *flagp |= HASWIDTH|SIMPLE;
+ goto found_posix;
+ }
}
}
+ found_posix: ;
}
- found_posix: ;
}
+
if (op != END) {
RExC_parse = (char *)orig_parse;
RExC_emit = (regnode *)orig_emit;