X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/6d351bf2e060da9a1c13a1f7c2deb014f74fe6b8..1f933721fdfb02e4d62d10a1a69032d38d4db05d:/regexec.c diff --git a/regexec.c b/regexec.c index d95b27a..375d4fd 100644 --- a/regexec.c +++ b/regexec.c @@ -1359,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, switch (OP(c)) { case ANYOF: if (utf8_target) { - REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || + REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) || !UTF8_IS_INVARIANT((U8)s[0]) ? reginclass(prog, c, (U8*)s, 0, utf8_target) : REGINCLASS(prog, c, (U8*)s)); @@ -5761,33 +5761,85 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; + case EXACT: + /* To get here, EXACTish nodes must have *byte* length == 1. That + * means they match only characters in the string that can be expressed + * as a single byte. For non-utf8 strings, that means a simple match. + * For utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, since it's a single byte, it either isn't utf8, or if + * it is, it's an invariant */ + + c = (U8)*STRING(p); + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if (! utf8_target || UNI_IS_INVARIANT(c)) { + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else { + + /* Here, the string is utf8, and the pattern char is different + * in utf8 than not, so can't compare them directly. 
Outside the + * loop, find the two utf8 bytes that represent c, and then + * look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; case EXACTFL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case EXACT: case EXACTF: - /* To get here, EXACTish nodes must have *byte* length == 1. That means - * they match only characters in the string that can be expressed as a - * single byte. For non-utf8 strings, that means a simple match. For - * utf8 strings, the character matched must be an invariant, or - * downgradable to a single byte. The pattern's utf8ness is - * irrelevant, as it must be a single byte, so either it isn't utf8, or - * if it is it's an invariant */ + + /* The comments for the EXACT case apply as well to these fold ones */ c = (U8)*STRING(p); assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); - if ((! utf8_target) || UNI_IS_INVARIANT(c)) { + if (utf8_target) { /* Use full Unicode fold matching */ + + /* For the EXACTFL case, it doesn't really make sense to compare + * locale and utf8, but it is the best we can do. The documents warn + * against mixing them */ + + char *tmpeol = loceol; + while (hardcount < max + && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, 1, UTF_PATTERN)) + { + scan = tmpeol; + tmpeol = loceol; + hardcount++; + } + + /* XXX Note that the above handles properly the German sharp s in + * the pattern matching ss in the string. But it doesn't handle + * properly cases where the string contains say 'LIGATURE ff' and + * the pattern is 'f+'. This would require, say, a new function or + * revised interface to foldEQ_utf8(), in which the maximum number + * of characters to match could be passed and it would return how + * many actually did. 
This is just one of many cases where + * multi-char folds don't work properly, and so the fix is being + * deferred */ + } + else { - /* Here, the string isn't utf8, or the character in the EXACT - * node is the same in utf8 as not, so can just do equality. - * Each matching char must be 1 byte long */ + /* Here, the string isn't utf8; and either the pattern isn't utf8 + * or c is an invariant, so its utf8ness doesn't affect c. Can + * just do simple comparisons for exact or fold matching. */ switch (OP(p)) { - case EXACT: - while (scan < loceol && UCHARAT(scan) == c) { - scan++; - } - break; case EXACTF: while (scan < loceol && (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) @@ -5806,61 +5858,6 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); } } - else { - - /* Here, the string is utf8, and the pattern char is different - * in utf8 than not. */ - - switch (OP(p)) { - case EXACT: - { - /* Fastest to find the two utf8 bytes that represent c, and - * then look for those in sequence in the utf8 string */ - U8 high = UTF8_TWO_BYTE_HI(c); - U8 low = UTF8_TWO_BYTE_LO(c); - loceol = PL_regeol; - - while (hardcount < max - && scan + 1 < loceol - && UCHARAT(scan) == high - && UCHARAT(scan + 1) == low) - { - scan += 2; - hardcount++; - } - } - break; - case EXACTFL: /* Doesn't really make sense, but is best we can - do. The documents warn against mixing locale - and utf8 */ - case EXACTF: - { /* utf8 string, so use utf8 foldEQ */ - char *tmpeol = loceol; - while (hardcount < max - && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, 1, UTF_PATTERN)) - { - scan = tmpeol; - tmpeol = loceol; - hardcount++; - } - - /* XXX Note that the above handles properly the German - * sharp ss in the pattern matching ss in the string. But - * it doesn't handle properly cases where the string - * contains say 'LIGATURE ff' and the pattern is 'f+'. 
- * This would require, say, a new function or revised - * interface to foldEQ_utf8(), in which the maximum number - * of characters to match could be passed and it would - * return how many actually did. This is just one of many - * cases where multi-char folds don't work properly, and so - * the fix is being deferred */ - } - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); - } - } break; case ANYOF: if (utf8_target) { @@ -6260,7 +6257,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, match = TRUE; } - if (!match && (flags & ANYOF_CLASS)) { + if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) { PL_reg_flags |= RF_tainted; if ( (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || @@ -6302,11 +6299,13 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that */ - if (!match && (utf8_target || (flags & ANYOF_UNICODE))) { + if (!match) { if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { match = TRUE; } - else { + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && flags & ANYOF_UTF8)) + { AV *av; SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); @@ -6335,12 +6334,103 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, } } if (!match) { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; + U8 folded[UTF8_MAXBYTES_CASE+1]; - STRLEN tmplen; - to_utf8_fold(utf8_p, tmpbuf, &tmplen); - if (swash_fetch(sw, tmpbuf, 1)) + /* See if the folded version matches */ + STRLEN foldlen; + to_utf8_fold(utf8_p, folded, &foldlen); + if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */ match = TRUE; + } + else { + /* The fold in a few cases of an above Latin1 char + * is in the Latin1 range, and hence may be in the + * bitmap */ + if (UTF8_IS_INVARIANT(*folded) + && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded))) + { + match = TRUE; + } + 
else if (UTF8_IS_DOWNGRADEABLE_START(*folded) + && ANYOF_BITMAP_TEST(n, + UNI_TO_NATIVE( + TWO_BYTE_UTF8_TO_UNI(folded[0], + folded[1])))) + { /* Since the fold comes from internally + * generated data, we can safely assume it is + * valid utf8 in the test above */ + + match = TRUE; + } + if (! match) { + SV** listp; + + /* Consider "k" =~ /[K]/i. The line above + * would have just folded the 'k' to itself, + * and that isn't going to match 'K'. So we + * look through the closure of everything that + * folds to 'k'. That will find the 'K'. + * Initialize the list, if necessary */ + if (! PL_utf8_foldclosures) { + + /* If the folds haven't been read in, call a + * fold function to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", + dummy, &dummy_len); + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } + + /* The data structure is a hash with the keys + * every character that is folded to, like 'k', + * and the values each an array of everything + * that folds to its key. e.g. [ 'k', 'K', + * KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) folded, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV i; + for (i = 0; i <= av_len(list); i++) { + SV** try_p = av_fetch(list, i, FALSE); + char* try_c; + if (try_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + /* Don't have to worry about embedded + * nulls since NULL isn't folded or + * foldable */ + try_c = SvPVX(*try_p); + if (UTF8_IS_INVARIANT(*try_c) + && ANYOF_BITMAP_TEST(n, + UNI_TO_NATIVE(*try_c))) + { + match = TRUE; + break; + } + else if + (UTF8_IS_DOWNGRADEABLE_START(*try_c) + && ANYOF_BITMAP_TEST(n, + UNI_TO_NATIVE( + TWO_BYTE_UTF8_TO_UNI(try_c[0], + try_c[1])))) + { + match = TRUE; + break; + } else if (swash_fetch(sw, + (U8*) try_c, 1)) + { + match = TRUE; + break; + } + } + } + } + } } }