= "Can't match, because target string needs to be in UTF-8\n";
#endif
+/* Returns a boolean as to whether the input unsigned number is a power of 2
+ * (2**0, 2**1, etc).  In other words if it has just a single bit set.
+ * Subtracting 1 from a power of 2 clears its only set bit, so the & yields
+ * zero; for any other non-zero number the uppermost bit survives the
+ * subtraction, so the & yields non-zero.  Zero has no bit set at all, so it is
+ * rejected explicitly.
+ *
+ * The argument is fully parenthesized so that expressions such as 'a ^ b' can
+ * be passed directly.  It is evaluated more than once, so it must not have
+ * side effects. */
+#define isPOWER_OF_2(n) ((n) && ((n) & ((n)-1)) == 0)
+
#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\
goto target; \
return FALSE; /* Things like CNTRL are always below 256 */
}
+STATIC char *
+S_find_next_ascii(char * s, const char * send, const bool utf8_target)
+{
+    /* Returns the position of the first ASCII byte in the sequence between 's'
+     * and 'send-1' inclusive; returns 'send' if none found.
+     *
+     * 'utf8_target' only affects how the final per-character loop advances:
+     * by UTF8SKIP() when true, otherwise one byte at a time. */
+
+    PERL_ARGS_ASSERT_FIND_NEXT_ASCII;
+
+#ifndef EBCDIC
+
+    /* Only take the word-at-a-time path if, after stepping up to the next
+     * word boundary, at least one full word remains before 'send' */
+    if ((STRLEN) (send - s) >= PERL_WORDSIZE
+
+                          /* This term is wordsize if subword; 0 if not */
+                          + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s)
+
+                          /* 'offset' */
+                          - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK))
+    {
+
+        /* Process per-byte until reach word boundary.  XXX This loop could be
+         * eliminated if we knew that this platform had fast unaligned reads */
+        while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s++;    /* khw didn't bother creating a separate loop for
+                       utf8_target */
+        }
+
+        /* Here, we know we have at least one full word to process.  Process
+         * per-word as long as we have at least a full word left */
+        do {
+
+            /* PERL_VARIANTS_WORD_MASK has just the high bit of each byte set,
+             * and on this (non-EBCDIC) branch a byte is ASCII exactly when
+             * its high bit is clear.  Complementing the word turns on the
+             * high bit of every ASCII byte, so the '&' is non-zero iff the
+             * word contains at least one ASCII byte.  (Testing the
+             * uncomplemented word against the complemented mask would instead
+             * ask "is any low bit set?", which overlooks NUL bytes: they are
+             * ASCII but have no bits set.) */
+            if (~ (* (const PERL_UINTMAX_T *) s) & PERL_VARIANTS_WORD_MASK) {
+                break;  /* The loops below pinpoint the exact byte */
+            }
+            s += PERL_WORDSIZE;
+        } while (s + PERL_WORDSIZE <= send);
+    }
+
+#endif
+
+    /* Process per-character.  This handles whatever the word loop left over,
+     * and is the only scan performed on EBCDIC builds or for short inputs */
+    if (utf8_target) {
+        while (s < send) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s += UTF8SKIP(s);
+        }
+    }
+    else {
+        while (s < send) {
+            if (isASCII(*s)) {
+                return s;
+            }
+            s++;
+        }
+    }
+
+    return s;
+}
+
+STATIC char *
+S_find_next_non_ascii(char * s, const char * send, const bool utf8_target)
+{
+    /* Scans the bytes from 's' up to, but not including, 'send', and returns
+     * a pointer to the first one that is not ASCII.  If every byte examined
+     * is ASCII, the return value is 'send'. */
+
+#ifdef EBCDIC
+
+    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+
+    /* No word-at-a-time shortcut here: examine one character at a time,
+     * stepping by the character's length for UTF-8 targets and by a single
+     * byte otherwise */
+    if (utf8_target) {
+        for (; s < send; s += UTF8SKIP(s)) {
+            if ( ! isASCII(*s)) {
+                return s;
+            }
+        }
+    }
+    else {
+        for (; s < send; s++) {
+            if ( ! isASCII(*s)) {
+                return s;
+            }
+        }
+    }
+
+    return s;
+
+#else
+
+    const U8 * first_variant = NULL;
+
+    PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII;
+    PERL_UNUSED_ARG(utf8_target);
+
+    /* On ASCII platforms the invariants are precisely the ASCII characters,
+     * so the invariant-string checker does the whole job: either the entire
+     * span is invariant (hence nothing non-ASCII in it), or it hands back the
+     * location of the first byte that isn't */
+    if (is_utf8_invariant_string_loc((U8 *) s,
+                                     (STRLEN) (send - s),
+                                     &first_variant))
+    {
+        return (char *) send;
+    }
+
+    return (char *) first_variant;
+
+#endif
+
+}
+
/*
* pregexec and friends
*/
);
break;
+ case ASCII:
+ s = find_next_ascii(s, strend, utf8_target);
+ if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+
+ break;
+
+ case NASCII:
+ s = find_next_non_ascii(s, strend, utf8_target);
+ if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) {
+ goto got_it;
+ }
+
+ break;
+
/* The argument to all the POSIX node types is the class number to pass to
* _generic_isCC() to build a mask for searching in PL_charclass[] */
}
to_complement = 1;
- /* FALLTHROUGH */
+ goto posixa;
case POSIXA:
- posixa:
/* Don't need to worry about utf8, as it can match only a single
- * byte invariant character. */
+ * byte invariant character. But we do anyway for performance reasons,
+ * as otherwise we would have to examine all the continuation
+ * characters */
+ if (utf8_target) {
+ REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c)));
+ break;
+ }
+
+ posixa:
REXEC_FBC_CLASS_SCAN(
to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
break;
}));
while (scan != NULL) {
-
-
next = scan + NEXT_OFF(scan);
if (next == scan)
next = NULL;
}
break;
+ case ASCII:
+ if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) {
+ sayNO;
+ }
+
+ locinput++; /* ASCII is always single byte */
+ break;
+
+ case NASCII:
+ if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) {
+ sayNO;
+ }
+
+ goto increment_locinput;
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number
* */
}
else { /* Not utf8_target */
if (ST.c1 == ST.c2) {
- while (locinput <= ST.maxpos &&
- UCHARAT(locinput) != ST.c1)
- locinput++;
- }
- else {
- while (locinput <= ST.maxpos
- && UCHARAT(locinput) != ST.c1
- && UCHARAT(locinput) != ST.c2)
- locinput++;
+ locinput = (char *) memchr(locinput,
+ ST.c1,
+ ST.maxpos + 1 - locinput);
+ if (! locinput) {
+ locinput = ST.maxpos + 1;
+ }
}
+ else {
+ U8 c1_c2_bits_differing = ST.c1 ^ ST.c2;
+
+ if (! isPOWER_OF_2(c1_c2_bits_differing)) {
+ while ( locinput <= ST.maxpos
+ && UCHARAT(locinput) != ST.c1
+ && UCHARAT(locinput) != ST.c2)
+ {
+ locinput++;
+ }
+ }
+ else {
+ /* If c1 and c2 only differ by a single bit, we can
+ * avoid a conditional each time through the loop,
+ * at the expense of a little preliminary setup and
+ * an extra mask each iteration. By masking out
+ * that bit, we match exactly two characters, c1
+ * and c2, and so we don't have to test for both.
+ * On both ASCII and EBCDIC platforms, most of the
+ * ASCII-range and Latin1-range folded equivalents
+ * differ only in a single bit, so this is actually
+ * the most common case. (e.g. 'A' 0x41 vs 'a'
+ * 0x61). */
+ U8 c1_masked = ST.c1 &~ c1_c2_bits_differing;
+ U8 c1_c2_mask = ~ c1_c2_bits_differing;
+ while ( locinput <= ST.maxpos
+ && (UCHARAT(locinput) & c1_c2_mask)
+ != c1_masked)
+ {
+ locinput++;
+ }
+ }
+ }
n = locinput - ST.oldloc;
}
if (locinput > ST.maxpos)
}
}
else {
- while (scan < loceol &&
- (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
- {
- scan++;
+ /* See comments in regmatch() CURLY_B_min_known_fail. We avoid
+ * a conditional each time through the loop if the characters
+ * differ only in a single bit, as is the usual situation */
+ U8 c1_c2_bits_differing = c1 ^ c2;
+
+ if (isPOWER_OF_2(c1_c2_bits_differing)) {
+ U8 c1_masked = c1 & ~ c1_c2_bits_differing;
+ U8 c1_c2_mask = ~ c1_c2_bits_differing;
+
+ while ( scan < loceol
+ && (UCHARAT(scan) & c1_c2_mask) == c1_masked)
+ {
+ scan++;
+ }
+ }
+ else {
+ while ( scan < loceol
+ && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
+ {
+ scan++;
+ }
}
}
}
}
break;
+ case ASCII:
+ if (utf8_target && loceol - scan > max) {
+
+ /* We didn't adjust <loceol> at the beginning of this routine
+ * because the target is UTF-8, but it is actually ok to do so, since here,
+ * to match, 1 char == 1 byte. */
+ loceol = scan + max;
+ }
+
+ scan = find_next_non_ascii(scan, loceol, utf8_target);
+ break;
+
+ case NASCII:
+ if (utf8_target) {
+ while ( hardcount < max
+ && scan < loceol
+ && ! isASCII_utf8_safe(scan, loceol))
+ {
+ scan += UTF8SKIP(scan);
+ hardcount++;
+ }
+ }
+ else {
+ scan = find_next_ascii(scan, loceol, utf8_target);
+ }
+ break;
+
/* The argument (FLAGS) to all the POSIX node types is the class number */
case NPOSIXL: