if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_invert(invlist);
}
- else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ if (new_node_has_latin1) {
/* Under /li, any 0-255 could fold to any other 0-255, depending on the
* locale. We can skip this if there are no 0-255 at all. */
_invlist_union(invlist, PL_Latin1, &invlist);
+
+ invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else {
+ if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
+ invlist = add_cp_to_invlist(invlist, 'I');
+ }
+ if (_invlist_contains_cp(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
+ {
+ invlist = add_cp_to_invlist(invlist, 'i');
+ }
+ }
}
/* Similarly add the UTF-8 locale possible matches. These have to be
}
else {
/* Any Latin1 range character can potentially match any
- * other depending on the locale */
+ * other depending on the locale, and in Turkic locales, U+130 and
+ * U+131 */
if (OP(node) == EXACTFL) {
_invlist_union(invlist, PL_Latin1, &invlist);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
}
else {
/* But otherwise, it matches at least itself. We can
invlist = add_cp_to_invlist(invlist, c);
}
+
+ if (OP(node) == EXACTFL) {
+
+ /* If either 'i' or 'I' is present in an EXACTFL node, the code above
+ * should already have added its normal case pair.  Under a Turkic
+ * locale, however, it could instead match that locale's case pairs
+ * (U+130 and U+131), so add those as potential matches as well */
+ if (isALPHA_FOLD_EQ(fc, 'I')) {
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_SMALL_LETTER_DOTLESS_I);
+ invlist = add_cp_to_invlist(invlist,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
+ invlist = add_cp_to_invlist(invlist, 'I');
+ }
+ else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ invlist = add_cp_to_invlist(invlist, 'i');
+ }
+ }
}
}
only_utf8_locale_list = NULL;
}
}
- if (only_utf8_locale_list) {
+ if ( only_utf8_locale_list
+ || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
+ || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
+ {
has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
anyof_flags
|= ANYOFL_FOLD
int i;
for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
- if (isASCII(*s)) {
+ if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
*(d++) = (U8) toFOLD_LC(*s);
s++;
}
}
else { /* an EXACTFish node which doesn't begin with a multi-char fold */
c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
- if (c1 > 255) {
+
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && OP(text_node) == EXACTFL
+ && UNLIKELY( c1 == 'i' || c1 == 'I'
+ || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
+ || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
+ { /* Hard-coded Turkish locale rules for these 4 characters
+ override normal rules */
+ if (c1 == 'i') {
+ c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+ }
+ else if (c1 == 'I') {
+ c2 = LATIN_SMALL_LETTER_DOTLESS_I;
+ }
+ else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ c2 = 'i';
+ }
+ else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
+ c2 = 'I';
+ }
+ }
+ else if (c1 > 255) {
const unsigned int * remaining_folds;
unsigned int first_fold;
utf8_p = utf8_buffer;
}
- if (swash_fetch(sw, utf8_p, TRUE)) {
+ /* Turkish locales have these hard-coded rules overriding
+ * normal ones */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && isALPHA_FOLD_EQ(*p, 'i'))
+ {
+ if (*p == 'i') {
+ if (swash_fetch(sw, (const U8 *) LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8, TRUE)) {
+ match = TRUE;
+ }
+ }
+ else if (*p == 'I') {
+ if (swash_fetch(sw, (const U8 *) LATIN_SMALL_LETTER_DOTLESS_I_UTF8, TRUE)) {
+ match = TRUE;
+ }
+ }
+ }
+ else if (swash_fetch(sw, utf8_p, TRUE)) {
match = TRUE;
}
}
}
}
+ /* In a Turkic locale under folding, hard-code the I i case pair
+ * matches */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && ! match
+ && (flags & ANYOFL_FOLD)
+ && utf8_target)
+ {
+ if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ if (ANYOF_BITMAP_TEST(n, 'i')) {
+ match = TRUE;
+ }
+ }
+ else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
+ if (ANYOF_BITMAP_TEST(n, 'I')) {
+ match = TRUE;
+ }
+ }
+ }
+
if (UNICODE_IS_SUPER(c)
&& (flags
& ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)