if ( noper_trietype
&&
(
- ( noper_trietype == NOTHING )
- ||
- ( trietype == NOTHING )
- ||
- ( trietype == noper_trietype )
+ /* XXX: Currently we cannot allow a NOTHING node to be the first element
+ * of a TRIEABLE sequence, otherwise we will overwrite the regop following
+ * the NOTHING with the TRIE regop later on. This is because a NOTHING node
+ * is only one regnode wide, and a TRIE is two regnodes. An example of a
+ * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/
+ * At a later point in time we can somewhat work around this by handling
+ * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns,
+ * as we can effectively ignore the NOTHING regop in that case.
+ * This clause, which allows NOTHING to start a sequence, is left commented
+ * out as a reference.
+ * - Yves
+
+ ( noper_trietype == NOTHING)
+ || ( trietype == NOTHING )
+ */
+ ( noper_trietype == NOTHING && trietype )
+ || ( trietype == noper_trietype )
)
#ifdef NOJUMPTRIE
&& noper_next == tail
/* handle unmergable node -
* noper may either be a triable node which can not be tried
* together with the current trie, or a non triable node */
- if ( last && trietype != NOTHING ) {
- /* if last is set then we have found at least two triable branch
- * sequences in a row of a similar trietype so we can turn them
- * into a trie */
- make_trie( pRExC_state,
- startbranch, first, cur, tail, count,
- trietype, depth+1 );
+ if ( last ) {
+ /* If last is set and trietype is not NOTHING then we have found
+ * at least two triable branch sequences in a row of a similar
+ * trietype so we can turn them into a trie. If/when we
+ * allow NOTHING to start a trie sequence this condition will be
+ * required, and it isn't expensive so we leave it in for now. */
+ if ( trietype != NOTHING )
+ make_trie( pRExC_state,
+ startbranch, first, cur, tail, count,
+ trietype, depth+1 );
last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
}
if ( noper_trietype
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
} else {
uc = *((U8*)STRING(scan));
}
}
if (UTF) {
const U8 * const s = (U8 *)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
}
else if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+
+ /* tmpbuf has been constructed by us, so we
+ * know it is valid utf8 */
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
s += unilen;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
len += unilen;