case CANY:
scan = loceol;
break;
- case EXACT: /* length of string is 1 */
+ case EXACT:
+ /* To get here, EXACT nodes must have *byte* length == 1. That means
+ * they match only characters in the string that can be expressed as a
+ * single byte. For non-utf8 strings, that means a simple match. For
+ * utf8 strings, the character matched must be an invariant, or
+ * downgradable to a single byte. The pattern's utf8ness is
+ * irrelevant: because the node is a single byte, either the pattern
+ * isn't utf8, or if it is, that byte is an invariant */
+
c = (U8)*STRING(p);
- while (scan < loceol && UCHARAT(scan) == c)
- scan++;
+ assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+ if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+
+ /* Here, the string isn't utf8, or the character in the EXACT
+ * node is the same in utf8 as not, so can just do equality.
+ * Each matching char must be 1 byte long */
+ while (scan < loceol && UCHARAT(scan) == c) {
+ scan++;
+ }
+ }
+ else {
+
+ /* Here, the string is utf8, and the char to match is different
+ * in utf8 than not. Fastest to find the two utf8 bytes that
+ * represent c, and then look for those in sequence in the utf8
+ * string */
+ U8 high = UTF8_TWO_BYTE_HI(c);
+ U8 low = UTF8_TWO_BYTE_LO(c);
+ loceol = PL_regeol;
+ while (hardcount < max
+ && scan + 1 < loceol
+ && UCHARAT(scan) == high
+ && UCHARAT(scan + 1) == low)
+ {
+ scan += 2;
+ hardcount++;
+ }
+ }
break;
case EXACTF: /* length of string is 1 */
c = (U8)*STRING(p);
}
-plan tests => 398; # Update this when adding/deleting tests.
+plan tests => 402; # Update this when adding/deleting tests.
run_tests() unless caller;
}
+ { # Check that a Latin1 character (\xc0) matches itself for every
+ # combination of utf8 and non-utf8 pattern and target (four tests)
+ my $c = "\xc0"; # target string; starts out non-utf8
+ my $pattern = my $utf8_pattern = qr/((\xc0)+,?)/; # NOTE(review): both names get the same qr// value
+ utf8::upgrade($utf8_pattern); # meant to yield a utf8 pattern -- TODO confirm this leaves $pattern non-utf8
+ ok $c =~ $pattern, "\\xc0 =~ $pattern; Neither pattern nor target utf8";
+ ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; pattern utf8, target not";
+ utf8::upgrade($c); # the remaining two tests use the upgraded target
+ ok $c =~ $pattern, "\\xc0 =~ $pattern; target utf8, pattern not";
+ ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; Both target and pattern utf8";
+ } # all four descriptions interpolate $pattern, not $utf8_pattern
+
{
# Test that a regex followed by an operator and/or a statement modifier work
# These tests use string-eval so that it reports a clean error when it fails