}
/* Add in the points from the bit map */
- for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
- if (ANYOF_BITMAP_TEST(node, i)) {
- unsigned int start = i++;
+ if (OP(node) != ANYOFH) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (ANYOF_BITMAP_TEST(node, i)) {
+ unsigned int start = i++;
- for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
- /* empty */
+ for (; i < NUM_ANYOF_CODE_POINTS
+ && ANYOF_BITMAP_TEST(node, i); ++i)
+ {
+ /* empty */
+ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
+ new_node_has_latin1 = TRUE;
}
- invlist = _add_range_to_invlist(invlist, start, i-1);
- new_node_has_latin1 = TRUE;
}
}
* that a character in the pattern corresponds to at most a single
* character in the target string. (And I do mean character, and not byte
* here, unlike other parts of the documentation that have never been
- * updated to account for multibyte Unicode.) sharp s in EXACTF and
+ * updated to account for multibyte Unicode.) Sharp s in EXACTF and
* EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
* nodes it can match "\x{17F}\x{17F}". These, along with other ones in
* EXACTFL nodes, violate the assumption, and they are the only instances
}
#endif
}
+
+ if ( STR_LEN(scan) == 1
+ && isALPHA_A(* STRING(scan))
+ && ( OP(scan) == EXACTFAA
+ || ( OP(scan) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan)))))
+ {
+ U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
+
+ /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
+ * with the mask set to the complement of the bit that differs
+ * between upper and lower case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ OP(scan) = ANYOFM;
+ ARG_SET(scan, *STRING(scan) & mask);
+ FLAGS(scan) = mask;
+ }
}
#ifdef DEBUGGING
OP(next) = EXACTFU;
}
+ if ( STR_LEN(next) == 1
+ && isALPHA_A(* STRING(next))
+ && ( OP(next) == EXACTFAA
+ || ( OP(next) == EXACTFU
+ && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next)))))
+ {
+ /* These differ in just one bit */
+ U8 mask = ~ ('A' ^ 'a');
+
+ assert(isALPHA_A(* STRING(next)));
+
+ /* Then replace it by an ANYOFM node, with
+ * the mask set to the complement of the
+ * bit that differs between upper and lower
+ * case, and the lowest code point of the
+ * pair (which the '&' forces) */
+ OP(next) = ANYOFM;
+ ARG_SET(next, *STRING(next) & mask);
+ FLAGS(next) = mask;
+ }
+
if (flags & SCF_DO_STCLASS) {
mincount = 0;
maxcount = REG_INFTY;
case ANYOFD:
case ANYOFL:
case ANYOFPOSIXL:
+ case ANYOFH:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
}
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
- int num; \
if (RExC_lastparse!=RExC_parse) { \
Perl_re_printf( aTHX_ "%s", \
Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
} else \
Perl_re_printf( aTHX_ "%16s",""); \
\
- num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
- if (RExC_lastnum!=num) \
- Perl_re_printf( aTHX_ "|%4d", num); \
+ if (RExC_lastnum!=RExC_emit) \
+ Perl_re_printf( aTHX_ "|%4d", RExC_emit); \
else \
Perl_re_printf( aTHX_ "|%4s",""); \
Perl_re_printf( aTHX_ "|%*s%-4s", \
(int)((depth*2)), "", \
(funcname) \
); \
- RExC_lastnum=num; \
+ RExC_lastnum=RExC_emit; \
RExC_lastparse=RExC_parse; \
})
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting open paren #%" IVdf " to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
- (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
+ (IV)parno, ret));
RExC_open_parens[parno]= ret;
}
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting close paren #%" IVdf " to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
- (IV)parno, REG_NODE_NUM(REGNODE_p(ender))));
+ (IV)parno, ender));
RExC_close_parens[parno]= ender;
if (RExC_nestroot == parno)
RExC_nestroot = 0;
DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
"%*s%*s Setting close paren #0 (END) to %d\n",
22, "| |", (int)(depth * 2 + 1), "",
- REG_NODE_NUM(REGNODE_p(ender))));
+ ender));
RExC_close_parens[0]= ender;
}
regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n",
SvPV_nolen_const(RExC_mysv1),
- (IV)REG_NODE_NUM(REGNODE_p(lastbr)),
+ (IV)lastbr,
SvPV_nolen_const(RExC_mysv2),
- (IV)REG_NODE_NUM(REGNODE_p(ender)),
+ (IV)ender,
(IV)(ender - lastbr)
);
);
SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(ret_as_regnode),
SvPV_nolen_const(RExC_mysv2),
- (IV)REG_NODE_NUM(REGNODE_p(ender)),
+ (IV)ender,
(IV)(ender - ret)
);
);
return op + EXACTF;
}
-PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
- regnode_offset node, I32* flagp, STRLEN len,
- UV code_point, bool downgradable)
-{
- /* This knows the details about sizing an EXACTish node, setting flags for
- * it (by setting <*flagp>, and potentially populating it with a single
- * character.
- *
- * If <len> (the length in bytes) is non-zero, this function assumes that
- * the node has already been populated, and just does the sizing. In this
- * case <code_point> should be the final code point that has already been
- * placed into the node. This value will be ignored except that under some
- * circumstances <*flagp> is set based on it.
- *
- * If <len> is zero, the function assumes that the node is to contain only
- * the single character given by <code_point> and calculates what <len>
- * should be. It populates the node's STRING with <code_point> or its
- * fold if folding.
- *
- * In both cases <*flagp> is appropriately set
- *
- * It knows that under FOLD, the Latin Sharp S and UTF characters above
- * 255, must be folded (the former only when the rules indicate it can
- * match 'ss')
- *
- * When it does the populating, it looks at the flag 'downgradable'. If
- * true with a node that folds, it checks if the single code point
- * participates in a fold, and if not downgrades the node to an EXACT.
- * This helps the optimizer */
-
- bool len_passed_in = cBOOL(len != 0);
- U8 character[UTF8_MAXBYTES_CASE+1];
-
- PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
-
- if (! len_passed_in) {
- if (UTF) {
- if (UVCHR_IS_INVARIANT(code_point)) {
- if (LOC || ! FOLD) { /* /l defers folding until runtime */
- *character = (U8) code_point;
- }
- else { /* Here is /i and not /l. */
- *character = toFOLD((U8) code_point);
-
- /* We can downgrade to an EXACT node if this character
- * isn't a folding one. Note that this assumes that
- * nothing above Latin1 folds to some other invariant than
- * one of these alphabetics; otherwise we would also have
- * to check:
- * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
- * || ASCII_FOLD_RESTRICTED))
- */
- if (downgradable && PL_fold[code_point] == code_point) {
- OP(REGNODE_p(node)) = EXACT;
- }
- }
- len = 1;
- }
- else if (FOLD && ( ! LOC
- || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
- { /* Folding, and ok to do so now */
- UV folded = _to_uni_fold_flags(
- code_point,
- character,
- &len,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
- if (downgradable
- && folded == code_point /* This quickly rules out many
- cases, avoiding the
- _invlist_contains_cp() overhead
- for those. */
- && ! _invlist_contains_cp(PL_in_some_fold, code_point))
- {
- OP(REGNODE_p(node)) = (LOC)
- ? EXACTL
- : EXACT;
- }
- }
- else if (code_point <= MAX_UTF8_TWO_BYTE) {
-
- /* Not folding this cp, and can output it directly */
- *character = UTF8_TWO_BYTE_HI(code_point);
- *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
- len = 2;
- }
- else {
- uvchr_to_utf8( character, code_point);
- len = UTF8SKIP(character);
- }
- } /* Else pattern isn't UTF8. */
- else if (! FOLD) {
- *character = (U8) code_point;
- len = 1;
- } /* Else is folded non-UTF8 */
-#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
- || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
- || UNICODE_DOT_DOT_VERSION > 0)
- else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-#else
- else if (1) {
-#endif
- *character = (U8) (DEPENDS_SEMANTICS)
- ? toFOLD(code_point)
- : (LOC)
- ? code_point
- : toLOWER_L1(code_point);
- len = 1;
-
- /* Can turn into an EXACT node if we know the fold at compile time,
- * and it folds to itself and doesn't particpate in other folds */
- if (downgradable
- && ! LOC
- && PL_fold_latin1[code_point] == code_point
- && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
- || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
- {
- OP(REGNODE_p(node)) = EXACT;
- }
- } /* else is Sharp s. May need to fold it */
- else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
- *character = 's';
- *(character + 1) = 's';
- len = 2;
- }
- else {
- *character = LATIN_SMALL_LETTER_SHARP_S;
- len = 1;
- }
- }
-
- if (downgradable) {
- change_engine_size(pRExC_state, STR_SZ(len));
- }
-
- RExC_emit += STR_SZ(len);
- STR_LEN(REGNODE_p(node)) = len;
- if (! len_passed_in) {
- Copy((char *) character, STRING(REGNODE_p(node)), len, char);
- }
-
- *flagp |= HASWIDTH;
-
- /* A single character node is SIMPLE, except for the special-cased SHARP S
- * under /di. */
- if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
-#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
- || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
- || UNICODE_DOT_DOT_VERSION > 0)
- && ( code_point != LATIN_SMALL_LETTER_SHARP_S
- || ! FOLD || ! DEPENDS_SEMANTICS)
-#endif
- ) {
- *flagp |= SIMPLE;
- }
-
- if (OP(REGNODE_p(node)) == EXACTFL) {
- RExC_contains_locale = 1;
- }
-}
-
STATIC bool
S_new_regcurly(const char *s, const char *e)
{
/* FALLTHROUGH */
case 'b':
{
+ U8 flags = 0;
regex_charset charset = get_regex_charset(RExC_flags);
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
op = BOUND + charset;
- if (op == BOUND) {
- RExC_seen_d_op = TRUE;
- }
- else if (op == BOUNDL) {
- RExC_contains_locale = 1;
- }
-
- ret = reg_node(pRExC_state, op);
- *flagp |= SIMPLE;
if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
- FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
+ flags = TRADITIONAL_BOUND;
if (op > BOUNDA) { /* /aa is same as /a */
- OP(REGNODE_p(ret)) = BOUNDA;
+ op = BOUNDA;
}
}
else {
{
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = GCB_BOUND;
+ flags = GCB_BOUND;
break;
case 'l':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = LB_BOUND;
+ flags = LB_BOUND;
break;
case 's':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = SB_BOUND;
+ flags = SB_BOUND;
break;
case 'w':
if (length != 2 || *(RExC_parse + 1) != 'b') {
goto bad_bound_type;
}
- FLAGS(REGNODE_p(ret)) = WB_BOUND;
+ flags = WB_BOUND;
break;
default:
bad_bound_type:
RExC_parse = endbrace;
REQUIRE_UNI_RULES(flagp, 0);
- if (op >= BOUNDA) { /* /aa is same as /a */
- OP(REGNODE_p(ret)) = BOUNDU;
+ if (op == BOUND) {
+ op = BOUNDU;
+ }
+ else if (op >= BOUNDA) { /* /aa is same as /a */
+ op = BOUNDU;
length += 4;
/* Don't have to worry about UTF-8, in this message because
}
}
+ if (op == BOUND) {
+ RExC_seen_d_op = TRUE;
+ }
+ else if (op == BOUNDL) {
+ RExC_contains_locale = 1;
+ }
+
if (invert) {
- OP(REGNODE_p(ret)) += NBOUND - BOUND;
+ op += NBOUND - BOUND;
}
+
+ ret = reg_node(pRExC_state, op);
+ FLAGS(REGNODE_p(ret)) = flags;
+
+ *flagp |= SIMPLE;
+
goto finish_meta_pat;
}
/* We can convert EXACTF nodes to EXACTFU if they contain only
* characters that match identically regardless of the target
* string's UTF8ness. The reason to do this is that EXACTF is not
- * trie-able, EXACTFU is.
+ * trie-able, EXACTFU is, and EXACTFU requires fewer operations at
+ * runtime.
*
* Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
* contain only above-Latin1 characters (hence must be in UTF8),
goto loopdone;
}
- /* This code point means we can't simplify things */
+ /* This problematic code point means we can't simplify
+ * things */
maybe_exactfu = FALSE;
/* Here, we are adding a problematic fold character.
* identifies, so when it is set to less than the full node, we can
* skip the rest of this */
if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+ PERL_UINT_FAST8_T backup_count = 0;
const STRLEN full_len = len;
assert(len >= MAX_NODE_STRING_SIZE);
- /* Here, <s> points to the final byte of the final character.
- * Look backwards through the string until find a non-
- * problematic character */
+ /* Here, <s> points to just beyond where we have output the
+ * final character of the node. Look backwards through the
+ * string until we find a non-problematic character */
if (! UTF) {
goto loopdone;
}
- while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
+ backup_count++;
+ }
len = s - s0 + 1;
}
else {
* special case the very first byte in the string, so
* we don't read outside the string */
s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+ backup_count++;
} /* End of loop backwards through the string */
/* If there were only problematic characters in the string,
} else {
/* Here, the node does contain some characters that aren't
- * problematic. If one such is the final character in the
- * node, we are done */
- if (len == full_len) {
+ * problematic. If we didn't have to backup any, then the
+ * final character in the node is non-problematic, and we
+ * can take the node as-is */
+ if (backup_count == 0) {
goto loopdone;
}
- else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+ else if (backup_count == 1) {
/* If the final character is problematic, but the
* penultimate is not, back-off that last character to
PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
assert(PL_regkind[OP(node)] == ANYOF);
+ /* There is no bitmap for this node type */
+ if (OP(node) == ANYOFH) {
+ return;
+ }
+
ANYOF_BITMAP_ZERO(node);
if (*invlist_ptr) {
STRLEN numlen;
int namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
- bool need_class = 0;
SV *listsv = NULL;
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
* time */
SV* swash = NULL; /* Code points that match \p{} \P{} */
- /* Set if a component of this character class is user-defined; just passed
- * on to the engine */
- bool has_user_defined_property = FALSE;
-
/* inversion list of code points this node matches only when the target
* string is in UTF-8. These are all non-ASCII, < 256. (Because is under
* /d) */
bool warn_super = ALWAYS_WARN_SUPER;
const char * orig_parse = RExC_parse;
- bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
/* This variable is used to mark where the end in the input is of something
* that looks like a POSIX construct but isn't. During the parse, when
one. */
U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
U32 posixl = 0; /* bit field of posix classes matched under /l */
- bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */
+
+
+/* Flags as to what things aren't knowable until runtime. (Note that these are
+ * mutually exclusive.) */
+#define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that
+ haven't been defined as of yet */
+#define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is
+ UTF-8 or not */
+#define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and
+ what gets folded */
+ U32 has_runtime_dependency = 0; /* OR of the above flags */
GET_RE_DEBUG_FLAGS_DECL;
(FOLD) ? "__" : "",
UTF8fARG(UTF, n, name),
(FOLD) ? "_i" : "");
- has_user_defined_property = TRUE;
- optimizable = FALSE; /* Will have to leave this an
- ANYOF node */
+ has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
/* We don't know yet what this matches, so have to flag
* it */
if (swash_init_flags
& _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
{
- has_user_defined_property = TRUE;
+ has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
}
}
}
if (invlist) {
- if (! has_user_defined_property &&
+ if (! (has_runtime_dependency
+ & HAS_USER_DEFINED_PROPERTY) &&
/* We warn on matching an above-Unicode code point
* if the match would return true, except don't
* warn for \p{All}, which has exactly one element
SV* scratch_list = NULL;
/* What the Posix classes (like \w, [:space:]) match in locale
- * isn't knowable under locale until actual match time. Room
- * must be reserved (one time per outer bracketed class) to
- * store such classes. The space will contain a bit for each
- * named class that is to be matched against. This isn't
- * needed for \p{} and pseudo-classes, as they are not affected
- * by locale, and hence are dealt with separately */
- if (! need_class) {
- need_class = 1;
- anyof_flags |= ANYOF_MATCHES_POSIXL;
-
- /* We can't change this into some other type of node
- * (unless this is the only element, in which case there
- * are nodes that mean exactly this) as has runtime
- * dependencies */
- optimizable = FALSE;
- }
-
- /* Coverity thinks it is possible for this to be negative; both
- * jhi and khw think it's not, but be safer */
- assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
- || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
-
- /* See if it already matches the complement of this POSIX
- * class */
- if ( (anyof_flags & ANYOF_MATCHES_POSIXL)
- && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
- ? -1
- : 1)))
- {
- posixl_matches_all = TRUE;
- break; /* No need to continue. Since it matches both
- e.g., \w and \W, it matches everything, and the
- bracketed class can be optimized into qr/./s */
- }
-
- /* Add this class to those that should be checked at runtime */
+ * isn't knowable under locale until actual match time. A
+ * special node is used for these which has extra space for a
+ * bitmap, with a bit reserved for each named class that is to
+ * be matched against. This isn't needed for \p{} and
+ * pseudo-classes, as they are not affected by locale, and
+ * hence are dealt with separately */
POSIXL_SET(posixl, namedclass);
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+ anyof_flags |= ANYOF_MATCHES_POSIXL;
/* The above-Latin1 characters are not subject to locale rules.
* Just add them to the unconditionally-matched list */
*
* See [perl #89750] */
if (FOLD && allow_multi_folds && value == prevvalue) {
- if (value == LATIN_SMALL_LETTER_SHARP_S
+ if ( value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
value)))
{
* the target string is in UTF-8. But things like \W match all the
* upper Latin1 characters if the target string is not in UTF-8.
*
- * Handle the case where there something like \W separately */
+ * Handle the case with something like \W separately */
if (nposixes) {
SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL);
SvREFCNT_dec(nonascii_but_latin1_properties);
- /* Get rid of any characters that we now know are matched
- * unconditionally from the conditional list, which may make
- * that list empty */
+ /* Get rid of any characters from the conditional list that we
+ * now know are matched unconditionally, which may make that
+ * list empty */
_invlist_subtract(upper_latin1_only_utf8_matches,
cp_list,
&upper_latin1_only_utf8_matches);
}
}
if (only_utf8_locale_list) {
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
anyof_flags
|= ANYOFL_FOLD
| ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
invlist_iterinit(cp_list);
if (invlist_iternext(cp_list, &start, &end) && start < 256) {
anyof_flags |= ANYOFL_FOLD;
+ has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
}
invlist_iterfinish(cp_list);
}
&& ( upper_latin1_only_utf8_matches
|| (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
{
- use_anyofd = TRUE;
RExC_seen_d_op = TRUE;
- optimizable = FALSE;
+ has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
}
- /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
- * at compile time. Besides not inverting folded locale now, we can't
- * invert if there are things such as \w, which aren't known until runtime
- * */
+ /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at
+ * compile time. */
if ( cp_list
&& invert
- && ! use_anyofd
- && ! (anyof_flags & (ANYOF_LOCALE_FLAGS))
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ && ! has_runtime_dependency)
{
_invlist_invert(cp_list);
swash = NULL;
}
- /* Clear the invert flag since have just done it here */
- invert = FALSE;
+ invert = FALSE;
}
if (ret_invlist) {
return RExC_emit;
}
+ /* All possible optimizations below still have these characteristics.
+ * (Multi-char folds aren't SIMPLE, but they don't get this far in this
+ * routine) */
+ *flagp |= HASWIDTH|SIMPLE;
+
+ if (anyof_flags & ANYOF_LOCALE_FLAGS) {
+ RExC_contains_locale = 1;
+ }
+
/* Some character classes are equivalent to other nodes. Such nodes take
- * up less room and generally fewer operations to execute than ANYOF nodes.
- * */
+ * up less room, and some nodes require fewer operations to execute, than
+ * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to
+ * improve efficiency. */
if (optimizable) {
- int posix_class = -1; /* Illegal value */
- UV start, end;
+ PERL_UINT_FAST8_T i;
+ Size_t partial_cp_count = 0;
+ UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
+ UV end[MAX_FOLD_FROMS+1] = { 0 };
+
+ if (cp_list) { /* Count the code points in enough ranges that we would
+ see all the ones possible in any fold in this version
+ of Unicode */
+
+ invlist_iterinit(cp_list);
+ for (i = 0; i <= MAX_FOLD_FROMS; i++) {
+ if (invlist_iternext(cp_list, &start[i], &end[i])) {
+ partial_cp_count += end[i] - start[i] + 1;
+ }
+ }
+
+ invlist_iterfinish(cp_list);
+ }
- if (UNLIKELY(posixl_matches_all)) {
- ret = reg_node(pRExC_state, SANY);
+ /* If we know at compile time that this matches every possible code
+ * point, any run-time dependencies don't matter */
+ if (start[0] == 0 && end[0] == UV_MAX) {
+ if (invert) {
+ ret = reganode(pRExC_state, OPFAIL, 0);
+ }
+ else {
+ ret = reg_node(pRExC_state, SANY);
+ MARK_NAUGHTY(1);
+ }
goto not_anyof;
}
- if (cp_list && ! invert) {
- invlist_iterinit(cp_list);
- if (! invlist_iternext(cp_list, &start, &end)) {
+ /* Similarly, for /l posix classes, if both a class and its
+ * complement match, any run-time dependencies don't matter */
+ if (posixl) {
+ for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
+ namedclass += 2)
+ {
+ if ( POSIXL_TEST(posixl, namedclass) /* class */
+ && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
+ {
+ if (invert) {
+ ret = reganode(pRExC_state, OPFAIL, 0);
+ }
+ else {
+ ret = reg_node(pRExC_state, SANY);
+ MARK_NAUGHTY(1);
+ }
+ goto not_anyof;
+ }
+ }
+ /* For well-behaved locales, some classes are subsets of others,
+ * so complementing the subset and including the non-complemented
+ * superset should match everything, like [\D[:alnum:]], and
+ * [[:^alpha:][:alnum:]], but some implementations of locales are
+ * buggy, and khw thinks it's a bad idea to have optimization change
+ * behavior, even if it avoids an OS bug in a given case */
+
+#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
+
+ /* If is a single posix /l class, can optimize to just that op.
+ * Such a node will not match anything in the Latin1 range, as that
+ * is not determinable until runtime, but will match whatever the
+ * class does outside that range. (Note that some classes won't
+ * match anything outside the range, like [:ascii:]) */
+ if ( isSINGLE_BIT_SET(posixl)
+ && (partial_cp_count == 0 || start[0] > 255))
+ {
+ U8 classnum;
+ SV * class_above_latin1 = NULL;
+ bool already_inverted;
+ bool are_equivalent;
+
+ /* Compute which bit is set, which is the same thing as, e.g.,
+ * ANYOF_CNTRL. From
+ * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
+ * */
+ static const int MultiplyDeBruijnBitPosition2[32] =
+ {
+ 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+ 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+ };
+
+ namedclass = MultiplyDeBruijnBitPosition2[(posixl
+ * 0x077CB531U) >> 27];
+ classnum = namedclass_to_classnum(namedclass);
+
+ /* The named classes are such that the inverted number is one
+ * larger than the non-inverted one */
+ already_inverted = namedclass
+ - classnum_to_namedclass(classnum);
+
+ /* Create an inversion list of the official property, inverted
+ * if the constructed node list is inverted, and restricted to
+ * only the above latin1 code points, which are the only ones
+ * known at compile time */
+ _invlist_intersection_maybe_complement_2nd(
+ PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+ already_inverted,
+ &class_above_latin1);
+ are_equivalent = _invlistEQ(class_above_latin1, cp_list,
+ FALSE);
+ SvREFCNT_dec_NN(class_above_latin1);
+
+ if (are_equivalent) {
+
+ /* Resolve the run-time inversion flag with this possibly
+ * inverted class */
+ invert = invert ^ already_inverted;
+
+ ret = reg_node(pRExC_state,
+ POSIXL + invert * (NPOSIXL - POSIXL));
+ FLAGS(REGNODE_p(ret)) = classnum;
+ goto not_anyof;
+ }
+ }
+ }
+
+ /* khw can't think of any other possible transformation involving
+ * these. */
+ if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
+ goto is_anyof;
+ }
- /* Here, the list is empty. This happens, for example, when a
- * Unicode property that doesn't match anything is the only
- * element in the character class (perluniprops.pod notes such
- * properties). */
+ if (! has_runtime_dependency) {
+
+ /* If the list is empty, nothing matches. This happens, for
+ * example, when a Unicode property that doesn't match anything is
+ * the only element in the character class (perluniprops.pod notes
+ * such properties). */
+ if (partial_cp_count == 0) {
+ assert (! invert);
ret = reganode(pRExC_state, OPFAIL, 0);
- *flagp |= HASWIDTH|SIMPLE;
goto not_anyof;
}
- if (start == end) { /* The range is a single code point */
- if (! invlist_iternext(cp_list, &start, &end)
+ /* If matches everything but \n */
+ if ( start[0] == 0 && end[0] == '\n' - 1
+ && start[1] == '\n' + 1 && end[1] == UV_MAX)
+ {
+ assert (! invert);
+ ret = reg_node(pRExC_state, REG_ANY);
+ MARK_NAUGHTY(1);
+ goto not_anyof;
+ }
+ }
- /* Don't do this optimization if it would require
- * changing the pattern to UTF-8 */
- && (start < 256 || UTF))
- {
- /* Here, the list contains a single code point. Can
- * optimize into an EXACTish node */
+ /* Next see if can optimize classes that contain just a few code points
+ * into an EXACTish node. The reason to do this is to let the
+ * optimizer join this node with adjacent EXACTish ones.
+ *
+ * An EXACTFish node can be generated even if not under /i, and vice
+ * versa. But care must be taken. An EXACTFish node has to be such
+ * that it only matches precisely the code points in the class, but we
+ * want to generate the least restrictive one that does that, to
+ * increase the odds of being able to join with an adjacent node. For
+ * example, if the class contains [kK], we have to make it an EXACTFAA
+ * node to prevent the KELVIN SIGN from matching. Whether we are under
+ * /i or not is irrelevant in this case. Less obvious is the pattern
+ * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is
+ * supposed to match the single character U+0149 LATIN SMALL LETTER N
+ * PRECEDED BY APOSTROPHE. And so even though there is no simple fold
+ * that includes \x{02BC}, there is a multi-char fold that does, and so
+ * the node generated for it must be an EXACTFish one. On the other
+ * hand qr/:/i should generate a plain EXACT node since the colon
+ * participates in no fold whatsoever, and having it EXACT tells the
+ * optimizer the target string cannot match unless it has a colon in
+ * it.
+ *
+ * We don't typically generate an EXACTish node if doing so would
+ * require changing the pattern to UTF-8, as that affects /d and
+ * otherwise is slower. However, under /i, not changing to UTF-8 can
+ * miss some potential multi-character folds. We calculate the
+ * EXACTish node, and then decide if something would be missed if we
+ * don't upgrade */
+ if ( ! posixl
+ && ! invert
+
+ /* Only try if there are no more code points in the class than
+ * in the max possible fold */
+ && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
+
+ && (start[0] < 256 || UTF || FOLD))
+ {
+ if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
+ {
+ /* We can always make a single code point class into an
+ * EXACTish node. */
+
+ if (LOC) {
+
+ /* Here is /l: Use EXACTL, except /li indicates EXACTFL,
+ * as that means there is a fold not known until runtime so
+ * shows as only a single code point here. */
+ op = (FOLD) ? EXACTFL : EXACTL;
+ }
+ else if (! FOLD) { /* Not /l and not /i */
+ op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+ }
+ else if (start[0] < 256) { /* /i, not /l, and the code point is
+ small */
+
+ /* Under /i, it gets a little tricky. A code point that
+ * doesn't participate in a fold should be an EXACT node.
+ * We know this one isn't the result of a simple fold, or
+ * there'd be more than one code point in the list, but it
+ * could be part of a multi-character fold. In that case
+ * we better not create an EXACT node, as we would wrongly
+ * be telling the optimizer that this code point must be in
+ * the target string, and that is wrong. This is because
+ * if the sequence around this code point forms a
+ * multi-char fold, what needs to be in the string could be
+ * the code point that folds to the sequence.
+ *
+ * This handles the case of code points below 256, as we
+ * have an easy look up for those. The next clause handles
+ * the ones at or above 256 */
+ op = IS_IN_SOME_FOLD_L1(start[0])
+ ? EXACTFU
+ : EXACT;
+ }
+ else { /* /i, larger code point. Since we are under /i, and
+ have just this code point, we know that it can't
+ fold to something else, so PL_InMultiCharFold
+ applies to it */
+ op = _invlist_contains_cp(PL_InMultiCharFold,
+ start[0])
+ ? EXACTFU_ONLY8
+ : EXACT_ONLY8;
+ }
+
+ value = start[0];
+ }
+ else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
+ && _invlist_contains_cp(PL_in_some_fold, start[0]))
+ {
+ /* Here, the only runtime dependency, if any, is from /d, and
+ * the class matches more than one code point, and the lowest
+ * code point participates in some fold. It might be that the
+ * other code points are /i equivalent to this one, and hence
+ * they would be representable by an EXACTFish node. Above, we
+ * eliminated classes that contain too many code points to be
+ * EXACTFish, with the test for MAX_FOLD_FROMS
+ *
+ * First, special case the ASCII fold pairs, like 'B' and 'b'.
+ * We do this because we have EXACTFAA at our disposal for the
+ * ASCII range */
+ if (partial_cp_count == 2 && isASCII(start[0])) {
+
+ /* The only ASCII characters that participate in folds are
+ * alphabetics */
+ assert(isALPHA(start[0]));
+ if ( end[0] == start[0] /* First range is a single
+ character, so 2nd exists */
+ && isALPHA_FOLD_EQ(start[0], start[1]))
+ {
+
+ /* Here, is part of an ASCII fold pair */
+
+ if ( ASCII_FOLD_RESTRICTED
+ || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
+ {
+ /* If the second clause just above was true, it
+ * means we can't be under /i, or else the list
+ * would have included more than this fold pair.
+ * Therefore we have to exclude the possibility of
+ * whatever else it is that folds to these, by
+ * using EXACTFAA */
+ op = EXACTFAA;
+ }
+ else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
+
+ /* Here, there's no simple fold that start[0] is part
+ * of, but there is a multi-character one. If we
+ * are not under /i, we want to exclude that
+ * possibility; if under /i, we want to include it
+ * */
+ op = (FOLD) ? EXACTFU : EXACTFAA;
+ }
+ else {
- value = start;
+ /* Here, the only possible fold start[0] participates in
+ * is with start[1]. /i or not isn't relevant */
+ op = EXACTFU;
+ }
- if (! FOLD) {
- op = (LOC)
- ? EXACTL
- : EXACT;
+ value = toFOLD(start[0]);
+ }
+ }
+ else if ( ! upper_latin1_only_utf8_matches
+ || ( _invlist_len(upper_latin1_only_utf8_matches)
+ == 2
+ && PL_fold_latin1[
+ invlist_highest(upper_latin1_only_utf8_matches)]
+ == start[0]))
+ {
+ /* Here, the smallest character is non-ascii or there are
+ * more than 2 code points matched by this node. Also, we
+ * either don't have /d UTF-8 dependent matches, or if we
+ * do, they look like they could be a single character that
+ * is the fold of the lowest one in the always-match list.
+ * This test quickly excludes most of the false positives
+ * when there are /d UTF-8 dependent matches. These are
+ * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
+ * SMALL LETTER A WITH GRAVE iff the target string is
+ * UTF-8. (We don't have to worry above about exceeding
+ * the array bounds of PL_fold_latin1[] because any code
+ * point in 'upper_latin1_only_utf8_matches' is below 256.)
+ *
+ * EXACTFAA would apply only to pairs (hence exactly 2 code
+ * points) in the ASCII range, so we can't use it here to
+ * artificially restrict the fold domain, so we check if
+ * the class does or does not match some EXACTFish node.
+ * Further, if we aren't under /i, and the folded-to
+ * character is part of a multi-character fold, we can't do
+ * this optimization, as the sequence around it could be
+ * that multi-character fold, and we don't here know the
+ * context, so we have to assume it is that multi-char
+ * fold, to prevent potential bugs.
+ *
+ * To do the general case, we first find the fold of the
+ * lowest code point (which may be higher than the lowest
+ * one), then find everything that folds to it. (The data
+ * structure we have only maps from the folded code points,
+ * so we have to do the earlier step.) */
+
+ Size_t foldlen;
+ U8 foldbuf[UTF8_MAXBYTES_CASE];
+ UV folded = _to_uni_fold_flags(start[0],
+ foldbuf, &foldlen, 0);
+ unsigned int first_fold;
+ const unsigned int * remaining_folds;
+ Size_t folds_to_this_cp_count = _inverse_folds(
+ folded,
+ &first_fold,
+ &remaining_folds);
+ Size_t folds_count = folds_to_this_cp_count + 1;
+ SV * fold_list = _new_invlist(folds_count);
+ unsigned int i;
+
+ /* If there are UTF-8 dependent matches, create a temporary
+ * list of what this node matches, including them. */
+ SV * all_cp_list = NULL;
+ SV ** use_this_list = &cp_list;
+
+ if (upper_latin1_only_utf8_matches) {
+ all_cp_list = _new_invlist(0);
+ use_this_list = &all_cp_list;
+ _invlist_union(cp_list,
+ upper_latin1_only_utf8_matches,
+ use_this_list);
}
- else if (LOC) {
- /* A locale node under folding with one code point can
- * be an EXACTFL, as its fold won't be calculated until
- * runtime */
- op = EXACTFL;
+ /* Having gotten everything that participates in the fold
+ * containing the lowest code point, we turn that into an
+ * inversion list, making sure everything is included. */
+ fold_list = add_cp_to_invlist(fold_list, start[0]);
+ fold_list = add_cp_to_invlist(fold_list, folded);
+ fold_list = add_cp_to_invlist(fold_list, first_fold);
+ for (i = 0; i < folds_to_this_cp_count - 1; i++) {
+ fold_list = add_cp_to_invlist(fold_list,
+ remaining_folds[i]);
}
- else {
- /* Here, we are generally folding, but there is only
- * one code point to match. If we have to, we use an
- * EXACT node, but it would be better for joining with
- * adjacent nodes in the optimization phase if we used
- * the same EXACTFish node that any such are likely to
- * be. We can do this iff the code point doesn't
- * participate in any folds. For example, an EXACTF of
- * a colon is the same as an EXACT one, since nothing
- * folds to or from a colon. */
- if (value < 256) {
- if (IS_IN_SOME_FOLD_L1(value)) {
- op = EXACT;
- }
- }
- else {
- if (_invlist_contains_cp(PL_in_some_fold, value)) {
- op = EXACT;
+ /* If the fold list is identical to what's in this ANYOF
+ * node, the node can be represented by an EXACTFish one
+ * instead */
+ if (_invlistEQ(*use_this_list, fold_list,
+ 0 /* Don't complement */ )
+ ) {
+
+ /* But, we have to be careful, as mentioned above.
+ * Just the right sequence of characters could match
+ * this if it is part of a multi-character fold. That
+ * IS what we want if we are under /i. But it ISN'T
+ * what we want if not under /i, as it could match when
+ * it shouldn't. So, when we aren't under /i and this
+ * character participates in a multi-char fold, we
+ * don't optimize into an EXACTFish node. So, for each
+ * case below we have to check if we are folding
+ * and if not, if it is not part of a multi-char fold.
+ * */
+ if (start[0] > 255) { /* Highish code point */
+ if (FOLD || ! _invlist_contains_cp(
+ PL_InMultiCharFold, folded))
+ {
+ op = (LOC)
+ ? EXACTFLU8
+ : (ASCII_FOLD_RESTRICTED)
+ ? EXACTFAA
+ : EXACTFU_ONLY8;
+ value = folded;
}
+ } /* Below, the lowest code point < 256 */
+ else if ( FOLD
+ && folded == 's'
+ && DEPENDS_SEMANTICS)
+ { /* An EXACTF node containing a single character
+ 's', can be an EXACTFU if it doesn't get
+ joined with an adjacent 's' */
+ op = EXACTFU_S_EDGE;
+ value = folded;
}
+ else if ( FOLD
+ || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
+ {
+ if (upper_latin1_only_utf8_matches) {
+ op = EXACTF;
- /* If we haven't found the node type, above, it means
- * we can use the prevailing one */
- if (op == END) {
- op = compute_EXACTish(pRExC_state);
+ /* We can't use the fold, as that only matches
+ * under UTF-8 */
+ value = start[0];
+ }
+ else if ( UNLIKELY(start[0] == MICRO_SIGN)
+ && ! UTF)
+ { /* EXACTFUP is a special node for this
+ character */
+ op = (ASCII_FOLD_RESTRICTED)
+ ? EXACTFAA
+ : EXACTFUP;
+ value = MICRO_SIGN;
+ }
+ else if ( ASCII_FOLD_RESTRICTED
+ && ! isASCII(start[0]))
+ { /* For ASCII under /iaa, we can use EXACTFU
+ below */
+ op = EXACTFAA;
+ value = folded;
+ }
+ else {
+ op = EXACTFU;
+ value = folded;
+ }
}
}
- }
- } /* End of first range contains just a single code point */
- else if (start == 0) {
- if (end == UV_MAX) {
- op = SANY;
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
- }
- else if (end == '\n' - 1
- && invlist_iternext(cp_list, &start, &end)
- && start == '\n' + 1 && end == UV_MAX)
- {
- op = REG_ANY;
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
+
+ SvREFCNT_dec_NN(fold_list);
+ SvREFCNT_dec(all_cp_list);
}
}
- invlist_iterfinish(cp_list);
if (op != END) {
- ret = reg_node(pRExC_state, op);
- if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
- TRUE /* downgradable to EXACT */
- );
+
+ /* Here, we have calculated what EXACTish node we would use.
+ * But we don't use it if it would require converting the
+ * pattern to UTF-8, unless not using it could cause us to miss
+ * some folds (hence be buggy) */
+
+ if (! UTF && value > 255) {
+ SV * in_multis = NULL;
+
+ assert(FOLD);
+
+ /* If there is no code point that is part of a multi-char
+ * fold, then there aren't any matches, so we don't do this
+ * optimization. Otherwise, it could match depending on
+ * the context around us, so we do upgrade */
+ _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
+ if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+ REQUIRE_UTF8(flagp);
+ }
+ else {
+ op = END;
+ }
+ }
+
+ if (op != END) {
+ U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
+
+ ret = regnode_guts(pRExC_state, op, len, "exact");
+ FILL_NODE(ret, op);
+ RExC_emit += 1 + STR_SZ(len);
+ STR_LEN(REGNODE_p(ret)) = len;
+ if (len == 1) {
+ *STRING(REGNODE_p(ret)) = value;
+ }
+ else {
+ uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
+ }
+ goto not_anyof;
}
- goto not_anyof;
}
+ }
- {
+ if (! has_runtime_dependency) {
/* See if this can be turned into an ANYOFM node. Think about the
* bit patterns in two different bytes. In some positions, the
* 0x30. Any other bytes ANDed yield something else. So [01],
* which is a common usage, is optimizable into ANYOFM, and can
* benefit from the speed up. We can only do this on UTF-8
- * invariant bytes, because they don't have the same patterns under
+ * invariant bytes, because they have the same bit patterns under
* UTF-8 as not. */
PERL_UINT_FAST8_T inverted = 0;
#ifdef EBCDIC
}
if (invlist_highest(cp_list) <= max_permissible) {
- UV this_start, this_end, lowest_cp;
+ UV this_start, this_end;
+ UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */
U8 bits_differing = 0;
- Size_t cp_count = 0;
+ Size_t full_cp_count = 0;
bool first_time = TRUE;
/* Go through the bytes and find the bit positions that differ
bits_differing |= i ^ lowest_cp;
}
- cp_count += this_end - this_start + 1;
+ full_cp_count += this_end - this_start + 1;
}
invlist_iterfinish(cp_list);
* a 1 in that position, and another has a 0. But that would
* mean that one of them differs from the lowest code point in
* that position, which possibility we've already excluded. */
- if ( (inverted || cp_count > 1)
- && cp_count == 1U << PL_bitcount[bits_differing])
+ if ( (inverted || full_cp_count > 1)
+ && full_cp_count == 1U << PL_bitcount[bits_differing])
{
U8 ANYOFM_mask;
/* The argument is the lowest code point */
ret = reganode(pRExC_state, op, lowest_cp);
FLAGS(REGNODE_p(ret)) = ANYOFM_mask;
-
- *flagp |= HASWIDTH|SIMPLE;
}
}
done_anyofm:
if (op != END) {
goto not_anyof;
}
- }
+ }
- /* Here, didn't find an optimization. See if this matches any
- * of the POSIX classes. The POSIXA ones are about the same speed
- * as ANYOF ops, but take less room; the ones that have
- * above-Latin1 code point matches are somewhat faster than ANYOF.
- * */
+ if (! posixl) {
+ PERL_UINT_FAST8_T type;
+ SV * intersection = NULL;
+ SV* d_invlist = NULL;
- for (posix_class = 0;
- posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
- posix_class++)
- {
- int try_inverted;
+ /* See if this matches any of the POSIX classes. The POSIXA and
+ * POSIXD ones are about the same speed as ANYOF ops, but take less
+ * room; the ones that have above-Latin1 code point matches are
+ * somewhat faster than ANYOF. */
+
+ for (type = POSIXA; type >= POSIXD; type--) {
+ int posix_class;
- for (try_inverted = 0; try_inverted < 2; try_inverted++)
+ if (type == POSIXL) { /* But not /l posix classes */
+ continue;
+ }
+
+ for (posix_class = 0;
+ posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+ posix_class++)
{
+ SV** our_code_points = &cp_list;
+ SV** official_code_points;
+ int try_inverted;
+
+ if (type == POSIXA) {
+ official_code_points = &PL_Posix_ptrs[posix_class];
+ }
+ else {
+ official_code_points = &PL_XPosix_ptrs[posix_class];
+ }
+
+ /* Skip non-existent classes of this type. e.g. \v only
+ * has an entry in PL_XPosix_ptrs */
+ if (! *official_code_points) {
+ continue;
+ }
+
+ /* Try both the regular class, and its inversion */
+ for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+ bool this_inverted = invert ^ try_inverted;
+
+ if (type != POSIXD) {
+
+ /* This class that isn't /d can't match if we have
+ * /d dependencies */
+ if (has_runtime_dependency
+ & HAS_D_RUNTIME_DEPENDENCY)
+ {
+ continue;
+ }
+ }
+ else /* is /d */ if (! this_inverted) {
+
+ /* /d classes don't match anything non-ASCII below
+ * 256 unconditionally (which cp_list contains) */
+ _invlist_intersection(cp_list, PL_UpperLatin1,
+ &intersection);
+ if (_invlist_len(intersection) != 0) {
+ continue;
+ }
- /* Check if matches POSIXA, normal or inverted */
- if (PL_Posix_ptrs[posix_class]) {
- if (_invlistEQ(cp_list,
- PL_Posix_ptrs[posix_class],
+ SvREFCNT_dec(d_invlist);
+ d_invlist = invlist_clone(cp_list, NULL);
+
+ /* But under UTF-8 it turns into using /u rules.
+ * Add the things it matches under these conditions
+ * so that we check below that these are identical
+ * to what the tested class should match */
+ if (upper_latin1_only_utf8_matches) {
+ _invlist_union(
+ d_invlist,
+ upper_latin1_only_utf8_matches,
+ &d_invlist);
+ }
+ our_code_points = &d_invlist;
+ }
+ else { /* POSIXD, inverted. If this doesn't have this
+ flag set, it isn't /d. */
+ if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ {
+ continue;
+ }
+ our_code_points = &cp_list;
+ }
+
+ /* Here, have weeded out some things. We want to see
+ * if the list of characters this node contains
+ * ('*our_code_points') precisely matches those of the
+ * class we are currently checking against
+ * ('*official_code_points'). */
+ if (_invlistEQ(*our_code_points,
+ *official_code_points,
try_inverted))
{
+ /* Here, they precisely match. Optimize this ANYOF
+ * node into its equivalent POSIX one of the
+ * correct type, possibly inverted */
ret = reg_node(pRExC_state, (try_inverted)
- ? NPOSIXA
- : POSIXA);
- FLAGS(REGNODE_p(ret)) = posix_class;
- *flagp |= HASWIDTH|SIMPLE;
+ ? type + NPOSIXA
+ - POSIXA
+ : type);
+ FLAGS(REGNODE_p(ret)) = posix_class;
+ SvREFCNT_dec(d_invlist);
+ SvREFCNT_dec(intersection);
goto not_anyof;
}
}
-
- /* Check if matches POSIXU, normal or inverted */
- if (_invlistEQ(cp_list,
- PL_XPosix_ptrs[posix_class],
- try_inverted))
- {
- ret = reg_node(pRExC_state, (try_inverted)
- ? NPOSIXU
- : POSIXU);
-
- FLAGS(REGNODE_p(ret)) = posix_class;
- *flagp |= HASWIDTH|SIMPLE;
- goto not_anyof;
- }
}
}
+ SvREFCNT_dec(d_invlist);
+ SvREFCNT_dec(intersection);
+ }
+
+ /* If didn't find an optimization and there is no need for a
+ * bitmap, optimize to indicate that */
+ if ( start[0] >= NUM_ANYOF_CODE_POINTS
+ && ! LOC
+ && ! upper_latin1_only_utf8_matches)
+ {
+ op = ANYOFH;
}
} /* End of seeing if can optimize it into a different node */
- /* It's going to be an ANYOF node. */
- op = (use_anyofd)
- ? ANYOFD
- : ((posixl)
- ? ANYOFPOSIXL
- : ((LOC)
- ? ANYOFL
- : ANYOF));
+ is_anyof: /* It's going to be an ANYOF node. */
+ if (op != ANYOFH) {
+ op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
+ ? ANYOFD
+ : ((posixl)
+ ? ANYOFPOSIXL
+ : ((LOC)
+ ? ANYOFL
+ : ANYOF));
+ }
+
ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
FILL_NODE(ret, op); /* We set the argument later */
RExC_emit += 1 + regarglen[op];
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
only_utf8_locale_list,
- swash, has_user_defined_property);
-
- *flagp |= HASWIDTH|SIMPLE;
-
- if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) {
- RExC_contains_locale = 1;
- }
-
+ swash, cBOOL(has_runtime_dependency
+ & HAS_USER_DEFINED_PROPERTY));
return ret;
not_anyof:
Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
RExC_parse - orig_parse);;
- SvREFCNT_dec_NN(cp_list);;
+ SvREFCNT_dec(cp_list);;
return ret;
}
STATIC void
S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
{
+ /* 'size' is the delta to add or subtract from the current memory allocated
+ * to the regex engine being constructed */
+
PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
RExC_size += size;
DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n",
- SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(REGNODE_p(scan)),
+ SvPV_nolen_const(RExC_mysv), scan,
(temp == NULL ? "->" : ""),
(temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "")
);
regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state);
Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n",
SvPV_nolen_const(RExC_mysv),
- REG_NODE_NUM(REGNODE_p(scan)),
+ scan,
PL_reg_name[exact]);
});
if (temp == NULL)
Perl_re_printf( aTHX_
"~ attach to %s (%" IVdf ") offset to %" IVdf "\n",
SvPV_nolen_const(RExC_mysv),
- (IV)REG_NODE_NUM(REGNODE_p(val)),
+ (IV)val,
(IV)(val - scan)
);
});
/* Ready to start outputting. First, the initial left bracket */
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- /* Then all the things that could fit in the bitmap */
- do_sep = put_charclass_bitmap_innards(sv,
- ANYOF_BITMAP(o),
- bitmap_range_not_in_bitmap,
- only_utf8_locale_invlist,
- o,
-
- /* Can't try inverting for a
- * better display if there are
- * things that haven't been
- * resolved */
- unresolved != NULL);
- SvREFCNT_dec(bitmap_range_not_in_bitmap);
-
- /* If there are user-defined properties which haven't been defined yet,
- * output them. If the result is not to be inverted, it is clearest to
- * output them in a separate [] from the bitmap range stuff. If the
- * result is to be complemented, we have to show everything in one [],
- * as the inversion applies to the whole thing. Use {braces} to
- * separate them from anything in the bitmap and anything above the
- * bitmap. */
- if (unresolved) {
- if (inverted) {
- if (! do_sep) { /* If didn't output anything in the bitmap */
- sv_catpvs(sv, "^");
+ if (OP(o) != ANYOFH) {
+ /* Then all the things that could fit in the bitmap */
+ do_sep = put_charclass_bitmap_innards(sv,
+ ANYOF_BITMAP(o),
+ bitmap_range_not_in_bitmap,
+ only_utf8_locale_invlist,
+ o,
+
+ /* Can't try inverting for a
+ * better display if there
+ * are things that haven't
+ * been resolved */
+ unresolved != NULL);
+ SvREFCNT_dec(bitmap_range_not_in_bitmap);
+
+ /* If there are user-defined properties which haven't been defined
+ * yet, output them. If the result is not to be inverted, it is
+ * clearest to output them in a separate [] from the bitmap range
+ * stuff. If the result is to be complemented, we have to show
+ * everything in one [], as the inversion applies to the whole
+ * thing. Use {braces} to separate them from anything in the
+ * bitmap and anything above the bitmap. */
+ if (unresolved) {
+ if (inverted) {
+ if (! do_sep) { /* If didn't output anything in the bitmap
+ */
+ sv_catpvs(sv, "^");
+ }
+ sv_catpvs(sv, "{");
}
- sv_catpvs(sv, "{");
- }
- else if (do_sep) {
- Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]);
- }
- sv_catsv(sv, unresolved);
- if (inverted) {
- sv_catpvs(sv, "}");
+ else if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1],
+ PL_colors[0]);
+ }
+ sv_catsv(sv, unresolved);
+ if (inverted) {
+ sv_catpvs(sv, "}");
+ }
+ do_sep = ! inverted;
}
- do_sep = ! inverted;
}
/* And, finally, add the above-the-bitmap stuff */
PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
UNI__PERL_FOLDS_TO_MULTI_CHAR]);
- PL_InMultiCharFold = _new_invlist_C_array(UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist);
- PL_NonFinalFold = _new_invlist_C_array(UNI__PERL_NON_FINAL_FOLDS_invlist);
+ PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
+ UNI__PERL_IS_IN_MULTI_CHAR_FOLD]);
+ PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[
+ UNI__PERL_NON_FINAL_FOLDS]);
PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist);
PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist);