X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/49fb50ea6e42cacccde2683fbed0b89f9eeb16cf..33bc847050ad68bb79f1e04db9100e25017348e1:/regexec.c diff --git a/regexec.c b/regexec.c index 2a5fa54..75d58ce 100644 --- a/regexec.c +++ b/regexec.c @@ -425,10 +425,8 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH) #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ -#ifndef PERL_IN_XSUB_RE - -bool -Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) +STATIC bool +S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) { /* Returns a boolean as to whether or not 'character' is a member of the * Posix character class given by 'classnum' that should be equivalent to a @@ -468,8 +466,6 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) return FALSE; } -#endif - PERL_STATIC_INLINE I32 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len) { @@ -4527,7 +4523,7 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e */ - U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { 0 }; + U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } }; U8 lengths[MAX_MATCHES] = { 0 }; U8 index_of_longest = 0; @@ -4694,24 +4690,37 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, * * Everything generally matches at least itself. But if there is a * UTF8ness mismatch, we have to convert to that of the target string. */ - if (utf8_pat == utf8_target || UTF8_IS_INVARIANT(*pat)) { - lengths[0] = MIN(pat_len, C_ARRAY_LENGTH(matches[0])); - Copy(pat, matches[0], lengths[0], U8); - m->count++; - } - else if (utf8_target) { /* target is UTF-8; pattern isn't */ - matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]); - matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]); - lengths[0] = 2; + if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */ + matches[0][0] = pat[0]; + lengths[0] = 1; m->count++; } - else { /* pattern is UTF-8, target isn't */ - if (UTF8_IS_DOWNGRADEABLE_START(*pat)) { - matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]); - lengths[0] = 1; + else if (utf8_target) { + if (utf8_pat) { + lengths[0] = UTF8SKIP(pat); + Copy(pat, matches[0], lengths[0], U8); + m->count++; + } + else { /* target is UTF-8, pattern isn't */ + matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]); + matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]); + lengths[0] = 2; m->count++; } } + else if (! utf8_pat) { /* Neither is UTF-8 */ + matches[0][0] = pat[0]; + lengths[0] = 1; + m->count++; + } + else /* target isn't UTF-8; pattern is. No match possible unless the + pattern's first character can fit in a byte */ + if (UTF8_IS_DOWNGRADEABLE_START(*pat)) + { + matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]); + lengths[0] = 1; + m->count++; + } /* Here we have taken care of any necessary node-type changes */ @@ -4849,8 +4858,8 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, lengths[m->count] = UVCHR_SKIP(fold_from); m->count++; } - else { /* Non-UTF8 target: any code point above 255 - can't appear in it */ + else { /* Non-UTF8 target: no code point above 255 can appear in it + */ if (fold_from > 255) { continue; } @@ -4973,7 +4982,10 @@ S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, if (m->count > 1) { /* No need to sort a single entry */ for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) { - /* Keep the same order for all but the longest */ + /* Keep the same order for all but the longest. (If the + * asserts fail, it could be because m->matches is declared too + * short, either because of a new Unicode release, or an + * overlooked test case, or it could be a bug.) */ if (i != index_of_longest) { assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches)); Copy(matches[i], m->matches + cur_pos, lengths[i], U8);