#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c)
#define invlist_set_previous_index S_invlist_set_previous_index
#define invlist_trim S_invlist_trim
+#define is_ssc_worth_it S_is_ssc_worth_it
#define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g)
#define make_trie(a,b,c,d,e,f,g,h) S_make_trie(aTHX_ a,b,c,d,e,f,g,h)
#define nextchar(a) S_nextchar(aTHX_ a)
Freeing REx: "[q]"
---
#Compiling REx "^(\S{1,9}):\s*(\d+)$"
-#synthetic stclass "ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY]".
#Final program:
# 1: SBOL (2)
# 2: OPEN1 (4)
# 17: CLOSE2 (19)
# 19: EOL (20)
# 20: END (0)
-#floating ":" at 1..9 (checking floating) stclass ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY] anchored(SBOL) minlen 3
#Freeing REx: "^(\S{1,9}):\s*(\d+)$"
-floating ":" at 1..9 (checking floating) stclass ANYOF[\x{00}-\x{06}\a\b\x{0E}-\x{1F}\x{21}-\x{FF}][{utf8}0100-167F 1681-1FFF 200B-2027 202A-202E 2030-205E 2060-2FFF 3001-INFINITY] anchored(SBOL) minlen 3
%MATCHED%
-synthetic stclass
+Freeing REx: "^(\S{1,9}):\s*(\d+)$"
---
#Compiling REx "(?(DEFINE)(?<foo>foo))(?(DEFINE)(?<bar>(?&foo)bar))(?(DEFINE"...
#Got 532 bytes for offset annotations.
#define PERL_ARGS_ASSERT_INVLIST_TRIM \
assert(invlist)
+STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
+#define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \
+ assert(pRExC_state); assert(ssc)
+
STATIC U32 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *unfolded_multi_char, U32 flags, regnode *val, U32 depth)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2)
ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
}
+#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+STATIC bool
+S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
+{
+ /* The synthetic start class is used to hopefully quickly winnow down
+ * places where a pattern could start a match in the target string. If it
+ * doesn't really narrow things down that much, there isn't much point to
+ * having the overhead of using it. This function uses some very crude
+ * heuristics to decide if to use the ssc or not.
+ *
+ * It returns TRUE if 'ssc' rules out more than half what it considers to
+ * be the "likely" possible matches, but of course it doesn't know what the
+ * actual things being matched are going to be; these are only guesses
+ *
+ * For /l matches, it assumes that the only likely matches are going to be
+ * in the 0-255 range, uniformly distributed, so half of that is 127
+ * For /a and /d matches, it assumes that the likely matches will be just
+ * the ASCII range, so half of that is 63
+ * For /u and there isn't anything matching above the Latin1 range, it
+ * assumes that that is the only range likely to be matched, and uses
+ * half that as the cut-off: 127. If anything matches above Latin1,
+ * it assumes that all of Unicode could match (uniformly), except for
+ * non-Unicode code points and things in the General Category "Other"
+ * (unassigned, private use, surrogates, controls and formats). This
+ * is a much large number. */
+
+ const U32 max_match = (LOC)
+ ? 127
+ : (! UNI_SEMANTICS)
+ ? 63
+ : (invlist_highest(ssc->invlist) < 256)
+ ? 127
+ : ((NON_OTHER_COUNT + 1) / 2) - 1;
+ U32 count = 0; /* Running total of number of code points matched by
+ 'ssc' */
+ UV start, end; /* Start and end points of current range in inversion
+ list */
+
+ PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
+
+ invlist_iterinit(ssc->invlist);
+ while (invlist_iternext(ssc->invlist, &start, &end)) {
+
+ /* /u is the only thing that we expect to match above 255; so if not /u
+ * and even if there are matches above 255, ignore them. This catches
+ * things like \d under /d which does match the digits above 255, but
+ * since the pattern is /d, it is not likely to be expecting them */
+ if (! UNI_SEMANTICS) {
+ if (start > 255) {
+ break;
+ }
+ end = MIN(end, 255);
+ }
+ count += end - start + 1;
+ if (count > max_match) {
+ invlist_iterfinish(ssc->invlist);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+
STATIC void
S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
{
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
- && !ssc_is_anything(data.start_class))
+ && is_ssc_worth_it(pRExC_state, data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
= r->float_substr = r->float_utf8 = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
- && ! ssc_is_anything(data.start_class))
+ && is_ssc_worth_it(pRExC_state, data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));