case CANY:
scan = loceol;
break;
+ case EXACTFL:
+ PL_reg_flags |= RF_tainted;
+ /* FALL THROUGH */
case EXACT:
- /* To get here, EXACT nodes must have *byte* length == 1. That means
+ case EXACTF:
+ /* To get here, EXACTish nodes must have *byte* length == 1. That means
* they match only characters in the string that can be expressed as a
* single byte. For non-utf8 strings, that means a simple match. For
* utf8 strings, the character matched must be an invariant, or
c = (U8)*STRING(p);
assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
/* Here, the string isn't utf8, or the character in the EXACT
* node is the same in utf8 as not, so can just do equality.
* Each matching char must be 1 byte long */
- while (scan < loceol && UCHARAT(scan) == c) {
- scan++;
+ switch (OP(p)) {
+ case EXACT:
+ while (scan < loceol && UCHARAT(scan) == c) {
+ scan++;
+ }
+ break;
+ case EXACTF:
+ while (scan < loceol &&
+ (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
+ {
+ scan++;
+ }
+ break;
+ case EXACTFL:
+ while (scan < loceol &&
+ (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
+ {
+ scan++;
+ }
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
}
}
else {
- /* Here, the string is utf8, and the char to match is different
- * in utf8 than not. Fastest to find the two utf8 bytes that
- * represent c, and then look for those in sequence in the utf8
- * string */
- U8 high = UTF8_TWO_BYTE_HI(c);
- U8 low = UTF8_TWO_BYTE_LO(c);
- loceol = PL_regeol;
- while (hardcount < max
- && scan + 1 < loceol
- && UCHARAT(scan) == high
- && UCHARAT(scan + 1) == low)
- {
- scan += 2;
- hardcount++;
+ /* Here, the string is utf8, and the pattern char is different
+ * in utf8 than not. */
+
+ switch (OP(p)) {
+ case EXACT:
+ {
+ /* Fastest to find the two utf8 bytes that represent c, and
+ * then look for those in sequence in the utf8 string */
+ U8 high = UTF8_TWO_BYTE_HI(c);
+ U8 low = UTF8_TWO_BYTE_LO(c);
+ loceol = PL_regeol;
+
+ while (hardcount < max
+ && scan + 1 < loceol
+ && UCHARAT(scan) == high
+ && UCHARAT(scan + 1) == low)
+ {
+ scan += 2;
+ hardcount++;
+ }
+ }
+ break;
+ case EXACTFL: /* Doesn't really make sense, but is best we can
+ do. The documents warn against mixing locale
+ and utf8 */
+ case EXACTF:
+ { /* utf8 string, so use utf8 foldEQ */
+ char *tmpeol = loceol;
+ while (hardcount < max
+ && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+ STRING(p), NULL, 1, UTF_PATTERN))
+ {
+ scan = tmpeol;
+ tmpeol = loceol;
+ hardcount++;
+ }
+
+ /* XXX Note that the above handles properly the German
+ * sharp ss in the pattern matching ss in the string. But
+ * it doesn't handle properly cases where the string
+ * contains say 'LIGATURE ff' and the pattern is 'f+'.
+ * This would require, say, a new function or revised
+ * interface to foldEQ_utf8(), in which the maximum number
+ * of characters to match could be passed and it would
+ * return how many actually did. This is just one of many
+ * cases where multi-char folds don't work properly, and so
+ * the fix is being deferred */
+ }
+ break;
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
}
}
break;
- case EXACTF: /* length of string is 1 */
- c = (U8)*STRING(p);
- while (scan < loceol &&
- (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
- scan++;
- break;
- case EXACTFL: /* length of string is 1 */
- PL_reg_flags |= RF_tainted;
- c = (U8)*STRING(p);
- while (scan < loceol &&
- (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
- scan++;
- break;
case ANYOF:
if (utf8_target) {
loceol = PL_regeol;
my $pattern = my $utf8_pattern = qr/((\xc0)+,?)/;
utf8::upgrade($utf8_pattern);
ok $c =~ $pattern, "\\xc0 =~ $pattern; Neither pattern nor target utf8";
+ ok $c =~ /$pattern/i, "\\xc0 =~ /$pattern/i; Neither pattern nor target utf8";
ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; pattern utf8, target not";
+ ok $c =~ /$utf8_pattern/i, "\\xc0 =~ /$pattern/i; pattern utf8, target not";
utf8::upgrade($c);
ok $c =~ $pattern, "\\xc0 =~ $pattern; target utf8, pattern not";
+ ok $c =~ /$pattern/i, "\\xc0 =~ /$pattern/i; target utf8, pattern not";
ok $c =~ $utf8_pattern, "\\xc0 =~ $pattern; Both target and pattern utf8";
+ ok $c =~ /$utf8_pattern/i, "\\xc0 =~ /$pattern/i; Both target and pattern utf8";
}
{