Add Turkish locale handling to /i pattern matching
author Karl Williamson <khw@cpan.org>
Tue, 5 Feb 2019 00:46:20 +0000 (17:46 -0700)
committer Karl Williamson <khw@cpan.org>
Tue, 5 Feb 2019 18:44:29 +0000 (11:44 -0700)
Previous commits in this series have changed uc(), lc(), fc(), etc. to
know how to handle Turkish UTF-8 locales.  This commit extends this to
/i regular expression pattern matching.

regcomp.c
regexec.c
t/re/anyof.t
utf8.c

index 547398f..b0d4710 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1651,11 +1651,26 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
     if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
         _invlist_invert(invlist);
     }
-    else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+    else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+        if (new_node_has_latin1) {
 
         /* Under /li, any 0-255 could fold to any other 0-255, depending on the
          * locale.  We can skip this if there are no 0-255 at all. */
         _invlist_union(invlist, PL_Latin1, &invlist);
+
+            invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+            invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+        }
+        else {
+            if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
+                invlist = add_cp_to_invlist(invlist, 'I');
+            }
+            if (_invlist_contains_cp(invlist,
+                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
+            {
+                invlist = add_cp_to_invlist(invlist, 'i');
+            }
+        }
     }
 
     /* Similarly add the UTF-8 locale possible matches.  These have to be
@@ -10637,9 +10652,14 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
         }
         else {
             /* Any Latin1 range character can potentially match any
-             * other depending on the locale */
+             * other depending on the locale, and in Turkic locales, U+130 and
+             * U+131 */
             if (OP(node) == EXACTFL) {
                 _invlist_union(invlist, PL_Latin1, &invlist);
+                invlist = add_cp_to_invlist(invlist,
+                                                LATIN_SMALL_LETTER_DOTLESS_I);
+                invlist = add_cp_to_invlist(invlist,
+                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
             }
             else {
                 /* But otherwise, it matches at least itself.  We can
@@ -10743,6 +10763,26 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
 
                 invlist = add_cp_to_invlist(invlist, c);
             }
+
+            if (OP(node) == EXACTFL) {
+
+                /* If either 'i' or 'I' is present in an EXACTFL node, the code
+                 * above should have added its normal case pair; but under a
+                 * Turkish locale it could instead match its Turkish case pair.
+                 * Add those as potential matches as well */
+                if (isALPHA_FOLD_EQ(fc, 'I')) {
+                    invlist = add_cp_to_invlist(invlist,
+                                                LATIN_SMALL_LETTER_DOTLESS_I);
+                    invlist = add_cp_to_invlist(invlist,
+                                        LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+                }
+                else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
+                    invlist = add_cp_to_invlist(invlist, 'I');
+                }
+                else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+                    invlist = add_cp_to_invlist(invlist, 'i');
+                }
+            }
         }
     }
 
@@ -18162,7 +18202,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                 only_utf8_locale_list = NULL;
             }
         }
-        if (only_utf8_locale_list) {
+        if (    only_utf8_locale_list
+            || (cp_list && (   _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)
+                            || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I))))
+        {
             has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
             anyof_flags
                  |= ANYOFL_FOLD
index fe6e0f5..ed6da24 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -4415,7 +4415,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
                 int i;
 
                 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
-                    if (isASCII(*s)) {
+                    if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) {
                         *(d++) = (U8) toFOLD_LC(*s);
                         s++;
                     }
@@ -4446,7 +4446,28 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
         }
         else { /* an EXACTFish node which doesn't begin with a multi-char fold */
             c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
-            if (c1 > 255) {
+
+            if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                && OP(text_node) == EXACTFL
+                && UNLIKELY(   c1 == 'i' || c1 == 'I'
+                            || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE
+                            || c1 == LATIN_SMALL_LETTER_DOTLESS_I))
+            {   /* Hard-coded Turkish locale rules for these 4 characters
+                   override normal rules */
+                if (c1 == 'i') {
+                    c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+                }
+                else if (c1 == 'I') {
+                    c2 = LATIN_SMALL_LETTER_DOTLESS_I;
+                }
+                else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+                    c2 = 'i';
+                }
+                else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) {
+                    c2 = 'I';
+                }
+            }
+            else if (c1 > 255) {
                 const unsigned int * remaining_folds;
                 unsigned int first_fold;
 
@@ -9781,7 +9802,23 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
                    utf8_p = utf8_buffer;
                }
 
-               if (swash_fetch(sw, utf8_p, TRUE)) {
+                /* Turkish locales have these hard-coded rules overriding
+                 * normal ones */
+                if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                    && isALPHA_FOLD_EQ(*p, 'i'))
+                {
+                    if (*p == 'i') {
+                        if (swash_fetch(sw, (const U8 *) LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8, TRUE)) {
+                            match = TRUE;
+                        }
+                    }
+                    else if (*p == 'I') {
+                        if (swash_fetch(sw, (const U8 *) LATIN_SMALL_LETTER_DOTLESS_I_UTF8, TRUE)) {
+                            match = TRUE;
+                        }
+                    }
+                }
+                else if (swash_fetch(sw, utf8_p, TRUE)) {
                    match = TRUE;
                 }
            }
@@ -9790,6 +9827,25 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const
             }
        }
 
+        /* In a Turkic locale under folding, hard-code the I i case pair
+         * matches */
+        if (     UNLIKELY(PL_in_utf8_turkic_locale)
+            && ! match
+            &&   (flags & ANYOFL_FOLD)
+            &&   utf8_target)
+        {
+            if (c == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+               if (ANYOF_BITMAP_TEST(n, 'i')) {
+                    match = TRUE;
+                }
+            }
+            else if (c == LATIN_SMALL_LETTER_DOTLESS_I) {
+               if (ANYOF_BITMAP_TEST(n, 'I')) {
+                    match = TRUE;
+                }
+            }
+        }
+
         if (UNICODE_IS_SUPER(c)
             && (flags
                & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
index 116c238..3656224 100644 (file)
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -454,7 +454,7 @@ my @tests = (
     '(?i)[\d\w]' => 'POSIXD[\w]',
     '(?i)[\D\w]' => 'SANY',
     #'(?i)(?l)[\d\w]' => varies depending on Unicode release
-    '(?i)(?l)[\D\w]' => 'ANYOFPOSIXL[\\w\\D][0100-INFTY]',
+    '(?i)(?l)[\D\w]' => 'ANYOFPOSIXL{i}[\\w\\D][0100-INFTY]',
     '(?i)(?u)[\d\w]' => 'POSIXU[\w]',
     '(?i)(?u)[\D\w]' => 'SANY',
     '(?i)(?a)[\d\w]' => 'POSIXA[\w]',
diff --git a/utf8.c b/utf8.c
index 94b6801..6354f85 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -5644,7 +5644,12 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1,
 
     if (flags & FOLDEQ_LOCALE) {
         if (IN_UTF8_CTYPE_LOCALE) {
-            flags &= ~FOLDEQ_LOCALE;
+            if (UNLIKELY(PL_in_utf8_turkic_locale)) {
+                flags_for_folder |= FOLD_FLAGS_LOCALE;
+            }
+            else {
+                flags &= ~FOLDEQ_LOCALE;
+            }
         }
         else {
             flags_for_folder |= FOLD_FLAGS_LOCALE;