switch (OP(c)) {
case ANYOF:
if (utf8_target) {
- REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
+ REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) ||
!UTF8_IS_INVARIANT((U8)s[0]) ?
reginclass(prog, c, (U8*)s, 0, utf8_target) :
REGINCLASS(prog, c, (U8*)s));
PerlIO_printf( Perl_debug_log,
" Scanning for legal start char...\n");
}
- );
- while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
- uc++;
- }
+ );
+ if (utf8_target) {
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc += UTF8SKIP(uc);
+ }
+ } else {
+ while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
+ uc++;
+ }
+ }
s= (char *)uc;
}
if (uc >(U8*)last_start) break;
"%*s %smatched empty string...%s\n",
REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
);
- break;
+ if (!trie->jump)
+ break;
} else {
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
case CANY:
scan = loceol;
break;
+ case EXACT:
+ /* To get here, EXACTish nodes must have *byte* length == 1. That
+ * means they match only characters in the string that can be expressed
+ * as a single byte. For non-utf8 strings, that means a simple match.
+ * For utf8 strings, the character matched must be an invariant, or
+ * downgradable to a single byte. The pattern's utf8ness is
+ * irrelevant: since it's a single byte, it either isn't utf8, or if
+ * it is, it's an invariant */
+
+ c = (U8)*STRING(p);
+ assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
+
+ if (! utf8_target || UNI_IS_INVARIANT(c)) {
+ while (scan < loceol && UCHARAT(scan) == c) {
+ scan++;
+ }
+ }
+ else {
+
+ /* Here, the string is utf8, and the pattern char is different
+ * in utf8 than not, so can't compare them directly. Outside the
+ * loop, find the two utf8 bytes that represent c, and then
+ * look for those in sequence in the utf8 string */
+ U8 high = UTF8_TWO_BYTE_HI(c);
+ U8 low = UTF8_TWO_BYTE_LO(c);
+ loceol = PL_regeol;
+
+ while (hardcount < max
+ && scan + 1 < loceol
+ && UCHARAT(scan) == high
+ && UCHARAT(scan + 1) == low)
+ {
+ scan += 2;
+ hardcount++;
+ }
+ }
+ break;
case EXACTFL:
PL_reg_flags |= RF_tainted;
/* FALL THROUGH */
- case EXACT:
case EXACTF:
- /* To get here, EXACTish nodes must have *byte* length == 1. That means
- * they match only characters in the string that can be expressed as a
- * single byte. For non-utf8 strings, that means a simple match. For
- * utf8 strings, the character matched must be an invariant, or
- * downgradable to a single byte. The pattern's utf8ness is
- * irrelevant, as it must be a single byte, so either it isn't utf8, or
- * if it is it's an invariant */
+
+ /* The comments for the EXACT case apply as well to these fold ones */
c = (U8)*STRING(p);
assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
- if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
+ if (utf8_target) { /* Use full Unicode fold matching */
+
+ /* For the EXACTFL case, it doesn't really make sense to compare
+ * locale and utf8, but it is the best we can do. The documents warn
+ * against mixing them */
- /* Here, the string isn't utf8, or the character in the EXACT
- * node is the same in utf8 as not, so can just do equality.
- * Each matching char must be 1 byte long */
+ char *tmpeol = loceol;
+ while (hardcount < max
+ && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
+ STRING(p), NULL, 1, UTF_PATTERN))
+ {
+ scan = tmpeol;
+ tmpeol = loceol;
+ hardcount++;
+ }
+
+ /* XXX Note that the above handles properly the German sharp s in
+ * the pattern matching ss in the string. But it doesn't handle
+ * properly cases where the string contains say 'LIGATURE ff' and
+ * the pattern is 'f+'. This would require, say, a new function or
+ * revised interface to foldEQ_utf8(), in which the maximum number
+ * of characters to match could be passed and it would return how
+ * many actually did. This is just one of many cases where
+ * multi-char folds don't work properly, and so the fix is being
+ * deferred */
+ }
+ else {
+
+ /* Here, the string isn't utf8; and either the pattern isn't utf8
+ * or c is an invariant, so its utf8ness doesn't affect c. Can
+ * just do simple comparisons for exact or fold matching. */
switch (OP(p)) {
- case EXACT:
- while (scan < loceol && UCHARAT(scan) == c) {
- scan++;
- }
- break;
case EXACTF:
while (scan < loceol &&
(UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
}
}
- else {
-
- /* Here, the string is utf8, and the pattern char is different
- * in utf8 than not. */
-
- switch (OP(p)) {
- case EXACT:
- {
- /* Fastest to find the two utf8 bytes that represent c, and
- * then look for those in sequence in the utf8 string */
- U8 high = UTF8_TWO_BYTE_HI(c);
- U8 low = UTF8_TWO_BYTE_LO(c);
- loceol = PL_regeol;
-
- while (hardcount < max
- && scan + 1 < loceol
- && UCHARAT(scan) == high
- && UCHARAT(scan + 1) == low)
- {
- scan += 2;
- hardcount++;
- }
- }
- break;
- case EXACTFL: /* Doesn't really make sense, but is best we can
- do. The documents warn against mixing locale
- and utf8 */
- case EXACTF:
- { /* utf8 string, so use utf8 foldEQ */
- char *tmpeol = loceol;
- while (hardcount < max
- && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
- STRING(p), NULL, 1, UTF_PATTERN))
- {
- scan = tmpeol;
- tmpeol = loceol;
- hardcount++;
- }
-
- /* XXX Note that the above handles properly the German
- * sharp ss in the pattern matching ss in the string. But
- * it doesn't handle properly cases where the string
- * contains say 'LIGATURE ff' and the pattern is 'f+'.
- * This would require, say, a new function or revised
- * interface to foldEQ_utf8(), in which the maximum number
- * of characters to match could be passed and it would
- * return how many actually did. This is just one of many
- * cases where multi-char folds don't work properly, and so
- * the fix is being deferred */
- }
- break;
- default:
- Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
- }
- }
break;
case ANYOF:
if (utf8_target) {
maxlen = c_len;
}
- if (utf8_target || (flags & ANYOF_UNICODE)) {
- if (utf8_target && !ANYOF_RUNTIME(n)) {
- if (c_len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
- match = TRUE;
- }
- if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
- match = TRUE;
- if (!match) {
- AV *av;
- SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
-
- if (sw) {
- U8 * utf8_p;
- if (utf8_target) {
- utf8_p = (U8 *) p;
- } else {
- STRLEN len = 1;
- utf8_p = bytes_to_utf8(p, &len);
- }
- if (swash_fetch(sw, utf8_p, 1))
- match = TRUE;
- else if (flags & ANYOF_FOLD) {
- if (!match && lenp && av) {
- I32 i;
- for (i = 0; i <= av_len(av); i++) {
- SV* const sv = *av_fetch(av, i, FALSE);
- STRLEN len;
- const char * const s = SvPV_const(sv, len);
- if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
- *lenp = len;
- match = TRUE;
- break;
- }
- }
- }
- if (!match) {
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
-
- STRLEN tmplen;
- to_utf8_fold(utf8_p, tmpbuf, &tmplen);
- if (swash_fetch(sw, tmpbuf, 1))
- match = TRUE;
- }
- }
-
- /* If we allocated a string above, free it */
- if (! utf8_target) Safefree(utf8_p);
- }
- }
- }
- if (!match && c < 256) {
+ /* If this character is potentially in the bitmap, check it */
+ if (c < 256) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
else if (flags & ANYOF_FOLD) {
match = TRUE;
}
- if (!match && (flags & ANYOF_CLASS)) {
+ if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
PL_reg_flags |= RF_tainted;
if (
(ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
}
}
+ /* If the bitmap didn't (or couldn't) match, and something outside the
+ * bitmap could match, try that */
+ if (!match) {
+ if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
+ match = TRUE;
+ }
+ else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
+ || (utf8_target && flags & ANYOF_UTF8))
+ {
+ AV *av;
+ SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
+
+ if (sw) {
+ U8 * utf8_p;
+ if (utf8_target) {
+ utf8_p = (U8 *) p;
+ } else {
+ STRLEN len = 1;
+ utf8_p = bytes_to_utf8(p, &len);
+ }
+ if (swash_fetch(sw, utf8_p, 1))
+ match = TRUE;
+ else if (flags & ANYOF_FOLD) {
+ if (!match && lenp && av) {
+ I32 i;
+ for (i = 0; i <= av_len(av); i++) {
+ SV* const sv = *av_fetch(av, i, FALSE);
+ STRLEN len;
+ const char * const s = SvPV_const(sv, len);
+ if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
+ *lenp = len;
+ match = TRUE;
+ break;
+ }
+ }
+ }
+ if (!match) {
+ U8 folded[UTF8_MAXBYTES_CASE+1];
+
+ /* See if the folded version matches */
+ STRLEN foldlen;
+ to_utf8_fold(utf8_p, folded, &foldlen);
+ if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */
+ match = TRUE;
+ }
+ else {
+ SV** listp;
+
+ /* Consider "k" =~ /[K]/i. The line above would
+ * have just folded the 'k' to itself, and that
+ * isn't going to match 'K'. So we look through
+ * the closure of everything that folds to 'k'.
+ * That will find the 'K'. Initialize the list, if
+ * necessary */
+ if (! PL_utf8_foldclosures) {
+
+ /* If the folds haven't been read in, call a
+ * fold function to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+ to_utf8_fold((U8*) "A", dummy, &dummy_len);
+ }
+ PL_utf8_foldclosures =
+ _swash_inversion_hash(PL_utf8_tofold);
+ }
+
+ /* The data structure is a hash with the keys every
+ * character that is folded to, like 'k', and the
+ * values each an array of everything that folds to
+ * its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) folded, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV i;
+ for (i = 0; i <= av_len(list); i++) {
+ SV** try_p = av_fetch(list, i, FALSE);
+ if (try_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ /* Don't have to worry about embedded NULs
+ * since NUL isn't folded or foldable */
+ if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
+ match = TRUE;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* If we allocated a string above, free it */
+ if (! utf8_target) Safefree(utf8_p);
+ }
+ }
+ }
+
return (flags & ANYOF_INVERT) ? !match : match;
}