This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Move macro defns, comments adjacent to fcn
[perl5.git] / pp.c
diff --git a/pp.c b/pp.c
index 533dcc1..c7fa231 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -28,6 +28,7 @@
 #include "perl.h"
 #include "keywords.h"
 
+#include "invlist_inline.h"
 #include "reentr.h"
 #include "regcharclass.h"
 
@@ -1693,7 +1694,8 @@ PP(pp_repeat)
            else {
                dTOPss;
                ASSUME(MARK + 1 == SP);
-               XPUSHs(sv);
+                MEXTEND(SP, 1);
+                PUSHs(sv);
                MARK[1] = &PL_sv_undef;
            }
            SP = MARK + 2;
@@ -2052,7 +2054,7 @@ PP(pp_lt)
     dSP;
     SV *left, *right;
 
-    tryAMAGICbin_MG(lt_amg, AMGf_set|AMGf_numeric);
+    tryAMAGICbin_MG(lt_amg, AMGf_numeric);
     right = POPs;
     left  = TOPs;
     SETs(boolSV(
@@ -2068,7 +2070,7 @@ PP(pp_gt)
     dSP;
     SV *left, *right;
 
-    tryAMAGICbin_MG(gt_amg, AMGf_set|AMGf_numeric);
+    tryAMAGICbin_MG(gt_amg, AMGf_numeric);
     right = POPs;
     left  = TOPs;
     SETs(boolSV(
@@ -2084,7 +2086,7 @@ PP(pp_le)
     dSP;
     SV *left, *right;
 
-    tryAMAGICbin_MG(le_amg, AMGf_set|AMGf_numeric);
+    tryAMAGICbin_MG(le_amg, AMGf_numeric);
     right = POPs;
     left  = TOPs;
     SETs(boolSV(
@@ -2100,7 +2102,7 @@ PP(pp_ge)
     dSP;
     SV *left, *right;
 
-    tryAMAGICbin_MG(ge_amg, AMGf_set|AMGf_numeric);
+    tryAMAGICbin_MG(ge_amg, AMGf_numeric);
     right = POPs;
     left  = TOPs;
     SETs(boolSV(
@@ -2116,7 +2118,7 @@ PP(pp_ne)
     dSP;
     SV *left, *right;
 
-    tryAMAGICbin_MG(ne_amg, AMGf_set|AMGf_numeric);
+    tryAMAGICbin_MG(ne_amg, AMGf_numeric);
     right = POPs;
     left  = TOPs;
     SETs(boolSV(
@@ -2248,7 +2250,7 @@ PP(pp_sle)
        break;
     }
 
-    tryAMAGICbin_MG(amg_type, AMGf_set);
+    tryAMAGICbin_MG(amg_type, 0);
     {
       dPOPTOPssrl;
       const int cmp =
@@ -2266,7 +2268,7 @@ PP(pp_sle)
 PP(pp_seq)
 {
     dSP;
-    tryAMAGICbin_MG(seq_amg, AMGf_set);
+    tryAMAGICbin_MG(seq_amg, 0);
     {
       dPOPTOPssrl;
       SETs(boolSV(sv_eq_flags(left, right, 0)));
@@ -2277,7 +2279,7 @@ PP(pp_seq)
 PP(pp_sne)
 {
     dSP;
-    tryAMAGICbin_MG(sne_amg, AMGf_set);
+    tryAMAGICbin_MG(sne_amg, 0);
     {
       dPOPTOPssrl;
       SETs(boolSV(!sv_eq_flags(left, right, 0)));
@@ -2512,7 +2514,7 @@ PP(pp_not)
     dSP;
     SV *sv;
 
-    tryAMAGICun_MG(not_amg, AMGf_set);
+    tryAMAGICun_MG(not_amg, 0);
     sv = *PL_stack_sp;
     *PL_stack_sp = boolSV(!SvTRUE_nomg_NN(sv));
     return NORMAL;
@@ -2709,7 +2711,7 @@ PP(pp_i_subtract)
 PP(pp_i_lt)
 {
     dSP;
-    tryAMAGICbin_MG(lt_amg, AMGf_set);
+    tryAMAGICbin_MG(lt_amg, 0);
     {
       dPOPTOPiirl_nomg;
       SETs(boolSV(left < right));
@@ -2720,7 +2722,7 @@ PP(pp_i_lt)
 PP(pp_i_gt)
 {
     dSP;
-    tryAMAGICbin_MG(gt_amg, AMGf_set);
+    tryAMAGICbin_MG(gt_amg, 0);
     {
       dPOPTOPiirl_nomg;
       SETs(boolSV(left > right));
@@ -2731,7 +2733,7 @@ PP(pp_i_gt)
 PP(pp_i_le)
 {
     dSP;
-    tryAMAGICbin_MG(le_amg, AMGf_set);
+    tryAMAGICbin_MG(le_amg, 0);
     {
       dPOPTOPiirl_nomg;
       SETs(boolSV(left <= right));
@@ -2742,7 +2744,7 @@ PP(pp_i_le)
 PP(pp_i_ge)
 {
     dSP;
-    tryAMAGICbin_MG(ge_amg, AMGf_set);
+    tryAMAGICbin_MG(ge_amg, 0);
     {
       dPOPTOPiirl_nomg;
       SETs(boolSV(left >= right));
@@ -2753,7 +2755,7 @@ PP(pp_i_ge)
 PP(pp_i_eq)
 {
     dSP;
-    tryAMAGICbin_MG(eq_amg, AMGf_set);
+    tryAMAGICbin_MG(eq_amg, 0);
     {
       dPOPTOPiirl_nomg;
       SETs(boolSV(left == right));
@@ -2764,7 +2766,7 @@ PP(pp_i_eq)
 PP(pp_i_ne)
 {
     dSP;
-    tryAMAGICbin_MG(ne_amg, AMGf_set);
+    tryAMAGICbin_MG(ne_amg, 0);
     {
       dPOPTOPiirl_nomg;
       SETs(boolSV(left != right));
@@ -3708,6 +3710,7 @@ PP(pp_ucfirst)
     STRLEN tculen;  /* tculen is the byte length of the freshly titlecased (or
                     * lowercased) character stored in tmpbuf.  May be either
                     * UTF-8 or not, but in either case is the number of bytes */
+    bool remove_dot_above = FALSE;
 
     s = (const U8*)SvPV_const(source, slen);
 
@@ -3748,7 +3751,37 @@ PP(pp_ucfirst)
 #ifdef USE_LOCALE_CTYPE
 
            _toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));
+
+            /* In turkic locales, lower casing an 'I' normally yields U+0131,
+             * LATIN SMALL LETTER DOTLESS I, but not if the grapheme also
+             * contains a COMBINING DOT ABOVE.  Instead it is treated like
+             * LATIN CAPITAL LETTER I WITH DOT ABOVE lowercased to 'i'.  The
+             * call to lowercase above has handled this.  But SpecialCasing.txt
+             * says we are supposed to remove the COMBINING DOT ABOVE.  We can
+             * tell if we have this situation if I ==> i in a turkic locale. */
+            if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                && IN_LC_RUNTIME(LC_CTYPE)
+                && (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')))
+            {
+                /* Here, we know there was a COMBINING DOT ABOVE.  We won't be
+                 * able to handle this in-place. */
+                inplace = FALSE;
+
+                /* It seems likely that the DOT will immediately follow the
+                 * 'I'.  If so, we can remove it simply by indicating to the
+                 * code below to start copying the source just beyond the DOT,
+                 * whose length is 2.  memBEGINs() takes a byte count, not an
+                 * end pointer, so pass what remains after the 1-byte 'I' */
+                if (LIKELY(memBEGINs(s + 1, slen - 1, COMBINING_DOT_ABOVE_UTF8))) {
+                    ulen += 2;
+                }
+                else {  /* Otherwise flag it for the code below to find */
+                    remove_dot_above = TRUE;
+                }
+            }
 #else
+            PERL_UNUSED_VAR(remove_dot_above);
+
            _toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 0);
 #endif
 
@@ -3766,41 +3799,63 @@ PP(pp_ucfirst)
                         * need to be overridden for the tricky ones */
        need = slen + 1;
 
-       if (op_type == OP_LCFIRST) {
 
-           /* lower case the first letter: no trickiness for any character */
 #ifdef USE_LOCALE_CTYPE
-            if (IN_LC_RUNTIME(LC_CTYPE)) {
-                *tmpbuf = toLOWER_LC(*s);
-            }
-            else
-#endif
+
+        if (IN_LC_RUNTIME(LC_CTYPE)) {
+            if (    UNLIKELY(PL_in_utf8_turkic_locale)
+                && (   (op_type == OP_LCFIRST && UNLIKELY(*s == 'I'))
+                    || (op_type == OP_UCFIRST && UNLIKELY(*s == 'i'))))
             {
-                *tmpbuf = (IN_UNI_8_BIT)
-                          ? toLOWER_LATIN1(*s)
-                          : toLOWER(*s);
+                if (*s == 'I') { /* lcfirst('I') */
+                    tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+                    tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+                }
+                else {  /* ucfirst('i') */
+                    tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+                    tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+                }
+                tculen = 2;
+                inplace = FALSE;
+                doing_utf8 = TRUE;
+                convert_source_to_utf8 = TRUE;
+                need += variant_under_utf8_count(s, s + slen);
             }
-       }
-#ifdef USE_LOCALE_CTYPE
-       /* is ucfirst() */
-       else if (IN_LC_RUNTIME(LC_CTYPE)) {
-            if (IN_UTF8_CTYPE_LOCALE) {
-                goto do_uni_rules;
+            else if (op_type == OP_LCFIRST) {
+
+                /* For lc, there are no gotchas for UTF-8 locales (other than
+                 * the turkish ones already handled above) */
+                *tmpbuf = toLOWER_LC(*s);
             }
+            else { /* ucfirst */
 
-            *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any
-                                              locales have upper and title case
-                                              different */
-       }
+                /* But for uc, some characters require special handling */
+                if (IN_UTF8_CTYPE_LOCALE) {
+                    goto do_uni_rules;
+                }
+
+                /* This would be a bug if any locales have upper and title case
+                 * different */
+                *tmpbuf = (U8) toUPPER_LC(*s);
+            }
+        }
+        else
 #endif
-       else if (! IN_UNI_8_BIT) {
-           *tmpbuf = toUPPER(*s);      /* Returns caseless for non-ascii, or
-                                        * on EBCDIC machines whatever the
-                                        * native function does */
-       }
+        /* Here, not in locale.  If not using Unicode rules, is a simple
+         * lower/upper, depending */
+        if (! IN_UNI_8_BIT) {
+            *tmpbuf = (op_type == OP_LCFIRST)
+                      ? toLOWER(*s)
+                      : toUPPER(*s);
+        }
+        else if (op_type == OP_LCFIRST) {
+            /* lower case the first letter: no trickiness for any character */
+            *tmpbuf = toLOWER_LATIN1(*s);
+        }
         else {
             /* Here, is ucfirst non-UTF-8, not in locale (unless that locale is
-             * UTF-8, which we treat as not in locale), and cased latin1 */
+             * non-turkic UTF-8, which we treat as not in locale), and cased
+             * latin1 */
            UV title_ord;
 #ifdef USE_LOCALE_CTYPE
       do_uni_rules:
@@ -3836,7 +3891,7 @@ PP(pp_ucfirst)
                             + 1;
 
                         /* The (converted) UTF-8 and UTF-EBCDIC lengths of all
-                         * (both) characters whose title case is above 255 is
+                         * characters whose title case is above 255 is
                          * 2. */
                        ulen = 2;
                    }
@@ -3880,6 +3935,29 @@ PP(pp_ucfirst)
                 * of the string. */
                sv_setpvn(dest, (char*)tmpbuf, tculen);
                if (slen > ulen) {
+
+                    /* But this boolean being set means we are in a turkic
+                     * locale, and there is a DOT character that needs to be
+                     * removed, and it isn't immediately after the current
+                     * character.  Keep concatenating characters to the output
+                     * one at a time, until we find the DOT, which we simply
+                     * skip */
+                    if (UNLIKELY(remove_dot_above)) {
+                        do {
+                            Size_t this_len = UTF8SKIP(s + ulen);
+
+                            sv_catpvn(dest, (char*)(s + ulen), this_len);
+                            ulen += this_len;
+                            /* memBEGINs() takes a byte count, not a pointer */
+                            if (memBEGINs(s + ulen, slen - ulen, COMBINING_DOT_ABOVE_UTF8)) {
+                                ulen += 2;
+                                break;
+                            }
+                        } while (ulen < slen);
+                    }
+
+                    /* The rest of the string can be concatenated unchanged,
+                     * all at once */
                    sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
                }
            }
@@ -3891,7 +3969,7 @@ PP(pp_ucfirst)
                 * into tmpbuf.  First put that into dest, and then append the
                 * rest of the source, converting it to UTF-8 as we go. */
 
-               /* Assert tculen is 2 here because the only two characters that
+               /* Assert tculen is 2 here because the only characters that
                 * get to this part of the code have 2-byte UTF-8 equivalents */
                 assert(tculen == 2);
                *d++ = *tmpbuf;
@@ -3957,6 +4035,7 @@ PP(pp_ucfirst)
 
 PP(pp_uc)
 {
+    dVAR;
     dSP;
     SV *source = TOPs;
     STRLEN len;
@@ -4039,12 +4118,16 @@ PP(pp_uc)
            STRLEN u;
            STRLEN ulen;
            UV uv;
-           if (in_iota_subscript && ! _is_utf8_mark(s)) {
+           if (UNLIKELY(in_iota_subscript)) {
+                UV cp = utf8_to_uvchr_buf(s, send, NULL);
+
+                if (! _invlist_contains_cp(PL_utf8_mark, cp)) {
 
-               /* A non-mark.  Time to output the iota subscript */
-               *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
-               *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);
-               in_iota_subscript = FALSE;
+                    /* A non-mark.  Time to output the iota subscript */
+                    *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
+                    *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);
+                    in_iota_subscript = FALSE;
+                }
             }
 
             /* Then handle the current character.  Get the changed case value
@@ -4119,15 +4202,24 @@ PP(pp_uc)
                     Size_t extra;
 
                    *d = toUPPER_LATIN1_MOD(*s);
-                   if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
+                   if (   LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)
+
+#ifdef USE_LOCALE_CTYPE
+
+                        && (LIKELY(   ! PL_in_utf8_turkic_locale
+                                   || ! IN_LC_RUNTIME(LC_CTYPE))
+                                   || *s != 'i')
+#endif
+
+                    ) {
                         continue;
                     }
 
                    /* The mainstream case is the tight loop above.  To avoid
-                    * extra tests in that, all three characters that require
-                    * special handling are mapped by the MOD to the one tested
-                    * just above.  
-                    * Use the source to distinguish between the three cases */
+                     * extra tests in that, all three characters that always
+                     * require special handling are mapped by the MOD to the
+                     * one tested just above.  Use the source to distinguish
+                     * between those cases */
 
 #if    UNICODE_MAJOR_VERSION > 2                                        \
    || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1          \
@@ -4145,7 +4237,7 @@ PP(pp_uc)
                    }
 #endif
 
-                   /* The other two special handling characters have their
+                   /* The other special handling characters have their
                     * upper cases outside the latin1 range, hence need to be
                     * in UTF-8, so the whole result needs to be in UTF-8.
                      *
@@ -4171,14 +4263,32 @@ PP(pp_uc)
                      * not require much extra code.
                      *
                      * First, calculate the extra space needed for the
-                     * remainder of the source needing to be in UTF-8.  The
+                     * remainder of the source needing to be in UTF-8.  Except
+                     * for the 'i' in Turkic locales, in UTF-8 strings, the
                      * uppercase of a character below 256 occupies the same
                      * number of bytes as the original.  Therefore, the space
                      * needed is the that number plus the number of characters
-                     * that become two bytes when converted to UTF-8. */
+                     * that become two bytes when converted to UTF-8, plus, in
+                     * turkish locales, the number of 'i's. */
 
                     extra = send - s + variant_under_utf8_count(s, send);
 
+#ifdef USE_LOCALE_CTYPE
+
+                    /* In a Turkic locale every 'i' still to come needs two
+                     * bytes, whichever character brought us here; count all */
+                    if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                        && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE)))
+                    {
+                        const U8 * s_peek = s;
+                        while ((s_peek = (U8 *) memchr(s_peek, 'i',
+                                                       send - s_peek))) {
+                            extra++;
+                            s_peek++;
+                        }
+                    }
+#endif
+
                     /* Convert what we have so far into UTF-8, telling the
                     * function that we know it should be converted, and to
                     * allow extra space for what we haven't processed yet.
@@ -4190,20 +4300,42 @@ PP(pp_uc)
                    SvCUR_set(dest, len);
                    len = sv_utf8_upgrade_flags_grow(dest,
                                                SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                               extra);
+                                                extra
+                                              + 1 /* trailing NUL */ );
                    d = (U8*)SvPVX(dest) + len;
 
                     /* Now process the remainder of the source, simultaneously
-                     * converting to upper and UTF-8. */
-                   for (; s < send; s++) {
-                       (void) _to_upper_title_latin1(*s, d, &len, 'S');
-                       d += len;
-                   }
-
-                   /* Here have processed the whole source; no need to continue
-                    * with the outer loop.  Each character has been converted
-                    * to upper case and converted to UTF-8 */
+                     * converting to upper and UTF-8.
+                     *
+                     * To avoid extra tests in the loop body, and since the
+                     * loop is so simple, split out the rare Turkic case into
+                     * its own loop */
 
+#ifdef USE_LOCALE_CTYPE
+                    if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                        && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE)))
+                    {
+                        for (; s < send; s++) {
+                            if (*s == 'i') {
+                                *d++ = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+                                *d++ = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+                            }
+                            else {
+                                (void) _to_upper_title_latin1(*s, d, &len, 'S');
+                                d += len;
+                            }
+                        }
+                    }
+                    else
+#endif
+                        for (; s < send; s++) {
+                            (void) _to_upper_title_latin1(*s, d, &len, 'S');
+                            d += len;
+                        }
+
+                    /* Here have processed the whole source; no need to
+                     * continue with the outer loop.  Each character has been
+                     * converted to upper case and converted to UTF-8. */
                    break;
                } /* End of processing all latin1-style chars */
            } /* End of processing all chars */
@@ -4235,15 +4367,26 @@ PP(pp_lc)
     SV *dest;
     const U8 *s;
     U8 *d;
+    bool has_turkic_I = FALSE;
 
     SvGETMAGIC(source);
 
     if (   SvPADTMP(source)
        && !SvREADONLY(source) && SvPOK(source)
-       && !DO_UTF8(source)) {
+       && !DO_UTF8(source)
+
+#ifdef USE_LOCALE_CTYPE
+
+        && (   LIKELY(! IN_LC_RUNTIME(LC_CTYPE))
+            || LIKELY(! PL_in_utf8_turkic_locale))
 
-       /* We can convert in place, as lowercasing anything in the latin1 range
-        * (or else DO_UTF8 would have been on) doesn't lengthen it */
+#endif
+
+    ) {
+
+        /* We can convert in place, as, outside of Turkic UTF-8 locales,
+         * lowercasing anything in the latin1 range (or else DO_UTF8 would have
+         * been on) doesn't lengthen it. */
        dest = source;
        s = d = (U8*)SvPV_force_nomg(source, len);
        min = len + 1;
@@ -4265,7 +4408,38 @@ PP(pp_lc)
 #ifdef USE_LOCALE_CTYPE
 
     if (IN_LC_RUNTIME(LC_CTYPE)) {
+        const U8 * next_I;
+
         _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+        /* Lowercasing in a Turkic locale can cause non-UTF-8 to need to become
+         * UTF-8 for the single case of the character 'I' */
+        if (     UNLIKELY(PL_in_utf8_turkic_locale)
+            && ! DO_UTF8(source)
+            &&   (next_I = (U8 *) memchr(s, 'I', len)))
+        {
+            Size_t I_count = 0;
+            const U8 *const send = s + len;
+
+            do {
+                I_count++;
+
+                next_I = (U8 *) memchr(next_I + 1, 'I',
+                                        send - (next_I + 1));
+            } while (next_I != NULL);
+
+            /* Except for the 'I', in UTF-8 strings, the lower case of a
+             * character below 256 occupies the same number of bytes as the
+             * original.  Therefore, the space needed is the original length
+             * plus I_count plus the number of characters that become two bytes
+             * when converted to UTF-8 */
+            sv_utf8_upgrade_flags_grow(dest, 0, len
+                                              + I_count
+                                              + variant_under_utf8_count(s, send)
+                                              + 1 /* Trailing NUL */ );
+            d = (U8*)SvPVX(dest);
+            has_turkic_I = TRUE;
+        }
     }
 
 #endif
@@ -4276,6 +4450,7 @@ PP(pp_lc)
     if (DO_UTF8(source)) {
        const U8 *const send = s + len;
        U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+        bool remove_dot_above = FALSE;
 
        while (s < send) {
            const STRLEN u = UTF8SKIP(s);
@@ -4284,7 +4459,33 @@ PP(pp_lc)
 #ifdef USE_LOCALE_CTYPE
 
            _toLOWER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));
+
+            /* If we are in a Turkic locale, we have to do more work.  As noted
+             * in the comments for lcfirst, there is a special case if an 'I'
+             * is in a grapheme with a COMBINING DOT ABOVE.  It turns into an
+             * 'i', and the DOT must be removed.  We check for that situation,
+             * and set a flag if the DOT is there.  Then each time through the
+             * loop, we have to see if we need to remove the next DOT ABOVE,
+             * and if so, do it.  We know that there is a DOT because
+             * _toLOWER_utf8_flags() wouldn't have returned 'i' unless there
+             * was one in a proper position. */
+            if (   UNLIKELY(PL_in_utf8_turkic_locale)
+                && IN_LC_RUNTIME(LC_CTYPE))
+            {
+                if (   UNLIKELY(remove_dot_above)
+                    && memBEGINs(tmpbuf, sizeof(tmpbuf), COMBINING_DOT_ABOVE_UTF8))
+                {
+                    s += u;
+                    remove_dot_above = FALSE;
+                    continue;
+                }
+                else if (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')) {
+                    remove_dot_above = TRUE;
+                }
+            }
 #else
+            PERL_UNUSED_VAR(remove_dot_above);
+
            _toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0);
 #endif
 
@@ -4316,7 +4517,7 @@ PP(pp_lc)
        SvUTF8_on(dest);
        *d = '\0';
        SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
-    } else {   /* Not utf8 */
+    } else {   /* 'source' not utf8 */
        if (len) {
            const U8 *const send = s + len;
 
@@ -4325,8 +4526,22 @@ PP(pp_lc)
             * whole thing in a tight loop, for speed, */
 #ifdef USE_LOCALE_CTYPE
             if (IN_LC_RUNTIME(LC_CTYPE)) {
-               for (; s < send; d++, s++)
-                   *d = toLOWER_LC(*s);
+                if (LIKELY( ! has_turkic_I)) {
+                    for (; s < send; d++, s++)
+                        *d = toLOWER_LC(*s);
+                }
+                else {  /* This is the only case where lc() converts 'dest'
+                           into UTF-8 from a non-UTF-8 'source' */
+                    for (; s < send; s++) {
+                        if (*s == 'I') {
+                            *d++ = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+                            *d++ = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+                        }
+                        else {
+                            append_utf8_from_native_byte(toLOWER_LATIN1(*s), &d);
+                        }
+                    }
+                }
             }
            else
 #endif
@@ -4534,33 +4749,55 @@ PP(pp_fc)
 #ifdef USE_LOCALE_CTYPE
       do_uni_folding:
 #endif
-            /* For ASCII and the Latin-1 range, there's two
+            /* For ASCII and the Latin-1 range, there's potentially three
              * troublesome folds:
              *      \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full
              *             casefolding becomes 'ss';
              *      \x{B5} (\N{MICRO SIGN}), which under any fold becomes
              *             \x{3BC} (\N{GREEK SMALL LETTER MU})
+             *      I      only in Turkic locales, this folds to \x{131}
+             *             \N{LATIN SMALL LETTER DOTLESS I}
              * For the rest, the casefold is their lowercase.  */
             for (; s < send; d++, s++) {
-                if (*s == MICRO_SIGN) {
+                if (    UNLIKELY(*s == MICRO_SIGN)
+#ifdef USE_LOCALE_CTYPE
+                    || (   UNLIKELY(PL_in_utf8_turkic_locale)
+                        && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE))
+                        && UNLIKELY(*s == 'I'))
+#endif
+                ) {
                     Size_t extra = send - s
                                  + variant_under_utf8_count(s, send);
 
                     /* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU},
-                     * which is outside of the latin-1 range. There's a couple
-                     * of ways to deal with this -- khw discusses them in
-                     * pp_lc/uc, so go there :) What we do here is upgrade what
-                     * we had already casefolded, then enter an inner loop that
-                     * appends the rest of the characters as UTF-8.
+                     * and that of 'I' in Turkic locales is \N{LATIN SMALL
+                     * LETTER DOTLESS I}, both of which are outside of the
+                     * latin-1 range. There's a couple of ways to deal with
+                     * this -- khw discusses them in pp_lc/uc, so go there :)
+                     * What we do here is upgrade what we had already
+                     * casefolded, then enter an inner loop that appends the
+                     * rest of the characters as UTF-8.
                      *
                      * First we calculate the needed size of the upgraded dest
                      * beyond what's been processed already (the upgrade
-                     * function figures that out).  In UTF-8 strings, the fold case of a
+                     * function figures that out).  Except for the 'I' in
+                     * Turkic locales, in UTF-8 strings, the fold case of a
                      * character below 256 occupies the same number of bytes as
                      * the original (even the Sharp S).  Therefore, the space
                      * needed is the number of bytes remaining plus the number
                      * of characters that become two bytes when converted to
-                     * UTF-8. */
+                     * UTF-8 plus, in turkish locales, the number of 'I's */
+
+                    /* Each 'I' from here on folds to a two-byte character, so
+                     * needs one more byte than already counted in 'extra' */
+                    if (UNLIKELY(*s == 'I')) {
+                        const U8 * s_peek = s;
+                        do {
+                            extra++;
+                            s_peek = (U8 *) memchr(s_peek + 1, 'I',
+                                                   send - (s_peek + 1));
+                        } while (s_peek != NULL);
+                    }
 
                     /* Growing may move things, so have to save and recalculate
                      * 'd' */
@@ -4568,7 +4805,8 @@ PP(pp_fc)
                     SvCUR_set(dest, len);
                     len = sv_utf8_upgrade_flags_grow(dest,
                                                 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                                extra);
+                                                extra
+                                              + 1 /* Trailing NUL */ );
                     d = (U8*)SvPVX(dest) + len;
 
                     *d++ = UTF8_TWO_BYTE_HI(GREEK_SMALL_LETTER_MU);