X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/2f8f985a27faf25c5a535cbe67d098690668c0f9..d92a4578eadaba6c3f452ae1b5536979cc3a7999:/pp.c diff --git a/pp.c b/pp.c index 533dcc1..c7fa231 100644 --- a/pp.c +++ b/pp.c @@ -28,6 +28,7 @@ #include "perl.h" #include "keywords.h" +#include "invlist_inline.h" #include "reentr.h" #include "regcharclass.h" @@ -1693,7 +1694,8 @@ PP(pp_repeat) else { dTOPss; ASSUME(MARK + 1 == SP); - XPUSHs(sv); + MEXTEND(SP, 1); + PUSHs(sv); MARK[1] = &PL_sv_undef; } SP = MARK + 2; @@ -2052,7 +2054,7 @@ PP(pp_lt) dSP; SV *left, *right; - tryAMAGICbin_MG(lt_amg, AMGf_set|AMGf_numeric); + tryAMAGICbin_MG(lt_amg, AMGf_numeric); right = POPs; left = TOPs; SETs(boolSV( @@ -2068,7 +2070,7 @@ PP(pp_gt) dSP; SV *left, *right; - tryAMAGICbin_MG(gt_amg, AMGf_set|AMGf_numeric); + tryAMAGICbin_MG(gt_amg, AMGf_numeric); right = POPs; left = TOPs; SETs(boolSV( @@ -2084,7 +2086,7 @@ PP(pp_le) dSP; SV *left, *right; - tryAMAGICbin_MG(le_amg, AMGf_set|AMGf_numeric); + tryAMAGICbin_MG(le_amg, AMGf_numeric); right = POPs; left = TOPs; SETs(boolSV( @@ -2100,7 +2102,7 @@ PP(pp_ge) dSP; SV *left, *right; - tryAMAGICbin_MG(ge_amg, AMGf_set|AMGf_numeric); + tryAMAGICbin_MG(ge_amg, AMGf_numeric); right = POPs; left = TOPs; SETs(boolSV( @@ -2116,7 +2118,7 @@ PP(pp_ne) dSP; SV *left, *right; - tryAMAGICbin_MG(ne_amg, AMGf_set|AMGf_numeric); + tryAMAGICbin_MG(ne_amg, AMGf_numeric); right = POPs; left = TOPs; SETs(boolSV( @@ -2248,7 +2250,7 @@ PP(pp_sle) break; } - tryAMAGICbin_MG(amg_type, AMGf_set); + tryAMAGICbin_MG(amg_type, 0); { dPOPTOPssrl; const int cmp = @@ -2266,7 +2268,7 @@ PP(pp_sle) PP(pp_seq) { dSP; - tryAMAGICbin_MG(seq_amg, AMGf_set); + tryAMAGICbin_MG(seq_amg, 0); { dPOPTOPssrl; SETs(boolSV(sv_eq_flags(left, right, 0))); @@ -2277,7 +2279,7 @@ PP(pp_seq) PP(pp_sne) { dSP; - tryAMAGICbin_MG(sne_amg, AMGf_set); + tryAMAGICbin_MG(sne_amg, 0); { dPOPTOPssrl; SETs(boolSV(!sv_eq_flags(left, right, 0))); @@ -2512,7 +2514,7 @@ PP(pp_not) dSP; SV *sv; 
- tryAMAGICun_MG(not_amg, AMGf_set); + tryAMAGICun_MG(not_amg, 0); sv = *PL_stack_sp; *PL_stack_sp = boolSV(!SvTRUE_nomg_NN(sv)); return NORMAL; @@ -2709,7 +2711,7 @@ PP(pp_i_subtract) PP(pp_i_lt) { dSP; - tryAMAGICbin_MG(lt_amg, AMGf_set); + tryAMAGICbin_MG(lt_amg, 0); { dPOPTOPiirl_nomg; SETs(boolSV(left < right)); @@ -2720,7 +2722,7 @@ PP(pp_i_lt) PP(pp_i_gt) { dSP; - tryAMAGICbin_MG(gt_amg, AMGf_set); + tryAMAGICbin_MG(gt_amg, 0); { dPOPTOPiirl_nomg; SETs(boolSV(left > right)); @@ -2731,7 +2733,7 @@ PP(pp_i_gt) PP(pp_i_le) { dSP; - tryAMAGICbin_MG(le_amg, AMGf_set); + tryAMAGICbin_MG(le_amg, 0); { dPOPTOPiirl_nomg; SETs(boolSV(left <= right)); @@ -2742,7 +2744,7 @@ PP(pp_i_le) PP(pp_i_ge) { dSP; - tryAMAGICbin_MG(ge_amg, AMGf_set); + tryAMAGICbin_MG(ge_amg, 0); { dPOPTOPiirl_nomg; SETs(boolSV(left >= right)); @@ -2753,7 +2755,7 @@ PP(pp_i_ge) PP(pp_i_eq) { dSP; - tryAMAGICbin_MG(eq_amg, AMGf_set); + tryAMAGICbin_MG(eq_amg, 0); { dPOPTOPiirl_nomg; SETs(boolSV(left == right)); @@ -2764,7 +2766,7 @@ PP(pp_i_eq) PP(pp_i_ne) { dSP; - tryAMAGICbin_MG(ne_amg, AMGf_set); + tryAMAGICbin_MG(ne_amg, 0); { dPOPTOPiirl_nomg; SETs(boolSV(left != right)); @@ -3708,6 +3710,7 @@ PP(pp_ucfirst) STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or * lowercased) character stored in tmpbuf. May be either * UTF-8 or not, but in either case is the number of bytes */ + bool remove_dot_above = FALSE; s = (const U8*)SvPV_const(source, slen); @@ -3748,7 +3751,37 @@ PP(pp_ucfirst) #ifdef USE_LOCALE_CTYPE _toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE)); + + /* In turkic locales, lower casing an 'I' normally yields U+0131, + * LATIN SMALL LETTER DOTLESS I, but not if the grapheme also + * contains a COMBINING DOT ABOVE. Instead it is treated like + * LATIN CAPITAL LETTER I WITH DOT ABOVE lowercased to 'i'. The + * call to lowercase above has handled this. But SpecialCasing.txt + * says we are supposed to remove the COMBINING DOT ABOVE. 
We can + * tell if we have this situation if I ==> i in a turkic locale. */ + if ( UNLIKELY(PL_in_utf8_turkic_locale) + && IN_LC_RUNTIME(LC_CTYPE) + && (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i'))) + { + /* Here, we know there was a COMBINING DOT ABOVE. We won't be + * able to handle this in-place. */ + inplace = FALSE; + + /* It seems likely that the DOT will immediately follow the + * 'I'. If so, we can remove it simply by indicating to the + * code below to start copying the source just beyond the DOT. + * We know its length is 2 */ + if (LIKELY(memBEGINs(s + 1, s + slen, COMBINING_DOT_ABOVE_UTF8))) { + ulen += 2; + } + else { /* But if it doesn't follow immediately, set a flag for + the code below */ + remove_dot_above = TRUE; + } + } #else + PERL_UNUSED_VAR(remove_dot_above); + _toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 0); #endif @@ -3766,41 +3799,63 @@ PP(pp_ucfirst) * need to be overridden for the tricky ones */ need = slen + 1; - if (op_type == OP_LCFIRST) { - /* lower case the first letter: no trickiness for any character */ #ifdef USE_LOCALE_CTYPE - if (IN_LC_RUNTIME(LC_CTYPE)) { - *tmpbuf = toLOWER_LC(*s); - } - else -#endif + + if (IN_LC_RUNTIME(LC_CTYPE)) { + if ( UNLIKELY(PL_in_utf8_turkic_locale) + && ( (op_type == OP_LCFIRST && UNLIKELY(*s == 'I')) + || (op_type == OP_UCFIRST && UNLIKELY(*s == 'i')))) { - *tmpbuf = (IN_UNI_8_BIT) - ? 
toLOWER_LATIN1(*s) - : toLOWER(*s); + if (*s == 'I') { /* lcfirst('I') */ + tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); + tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); + } + else { /* ucfirst('i') */ + tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + tculen = 2; + inplace = FALSE; + doing_utf8 = TRUE; + convert_source_to_utf8 = TRUE; + need += variant_under_utf8_count(s, s + slen); } - } -#ifdef USE_LOCALE_CTYPE - /* is ucfirst() */ - else if (IN_LC_RUNTIME(LC_CTYPE)) { - if (IN_UTF8_CTYPE_LOCALE) { - goto do_uni_rules; + else if (op_type == OP_LCFIRST) { + + /* For lc, there are no gotchas for UTF-8 locales (other than + * the turkish ones already handled above) */ + *tmpbuf = toLOWER_LC(*s); } + else { /* ucfirst */ - *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any - locales have upper and title case - different */ - } + /* But for uc, some characters require special handling */ + if (IN_UTF8_CTYPE_LOCALE) { + goto do_uni_rules; + } + + /* This would be a bug if any locales have upper and title case + * different */ + *tmpbuf = (U8) toUPPER_LC(*s); + } + } + else #endif - else if (! IN_UNI_8_BIT) { - *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or - * on EBCDIC machines whatever the - * native function does */ - } + /* Here, not in locale. If not using Unicode rules, is a simple + * lower/upper, depending */ + if (! IN_UNI_8_BIT) { + *tmpbuf = (op_type == OP_LCFIRST) + ? 
toLOWER(*s) + : toUPPER(*s); + } + else if (op_type == OP_LCFIRST) { + /* lower case the first letter: no trickiness for any character */ + *tmpbuf = toLOWER_LATIN1(*s); + } else { /* Here, is ucfirst non-UTF-8, not in locale (unless that locale is - * UTF-8, which we treat as not in locale), and cased latin1 */ + * non-turkic UTF-8, which we treat as not in locale), and cased + * latin1 */ UV title_ord; #ifdef USE_LOCALE_CTYPE do_uni_rules: @@ -3836,7 +3891,7 @@ PP(pp_ucfirst) + 1; /* The (converted) UTF-8 and UTF-EBCDIC lengths of all - * (both) characters whose title case is above 255 is + * characters whose title case is above 255 is * 2. */ ulen = 2; } @@ -3880,6 +3935,29 @@ PP(pp_ucfirst) * of the string. */ sv_setpvn(dest, (char*)tmpbuf, tculen); if (slen > ulen) { + + /* But this boolean being set means we are in a turkic + * locale, and there is a DOT character that needs to be + * removed, and it isn't immediately after the current + * character. Keep concatenating characters to the output + * one at a time, until we find the DOT, which we simply + * skip */ + if (UNLIKELY(remove_dot_above)) { + do { + Size_t this_len = UTF8SKIP(s + ulen); + + sv_catpvn(dest, (char*)(s + ulen), this_len); + + ulen += this_len; + if (memBEGINs(s + ulen, s + slen, COMBINING_DOT_ABOVE_UTF8)) { + ulen += 2; + break; + } + } while (s + ulen < s + slen); + } + + /* The rest of the string can be concatenated unchanged, + * all at once */ sv_catpvn(dest, (char*)(s + ulen), slen - ulen); } } @@ -3891,7 +3969,7 @@ PP(pp_ucfirst) * into tmpbuf. First put that into dest, and then append the * rest of the source, converting it to UTF-8 as we go. 
*/ - /* Assert tculen is 2 here because the only two characters that + /* Assert tculen is 2 here because the only characters that * get to this part of the code have 2-byte UTF-8 equivalents */ assert(tculen == 2); *d++ = *tmpbuf; @@ -3957,6 +4035,7 @@ PP(pp_ucfirst) PP(pp_uc) { + dVAR; dSP; SV *source = TOPs; STRLEN len; @@ -4039,12 +4118,16 @@ PP(pp_uc) STRLEN u; STRLEN ulen; UV uv; - if (in_iota_subscript && ! _is_utf8_mark(s)) { + if (UNLIKELY(in_iota_subscript)) { + UV cp = utf8_to_uvchr_buf(s, send, NULL); + + if (! _invlist_contains_cp(PL_utf8_mark, cp)) { - /* A non-mark. Time to output the iota subscript */ - *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA); - *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA); - in_iota_subscript = FALSE; + /* A non-mark. Time to output the iota subscript */ + *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA); + *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA); + in_iota_subscript = FALSE; + } } /* Then handle the current character. Get the changed case value @@ -4119,15 +4202,24 @@ PP(pp_uc) Size_t extra; *d = toUPPER_LATIN1_MOD(*s); - if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + if ( LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) + +#ifdef USE_LOCALE_CTYPE + + && (LIKELY( ! PL_in_utf8_turkic_locale + || ! IN_LC_RUNTIME(LC_CTYPE)) + || *s != 'i') +#endif + + ) { continue; } /* The mainstream case is the tight loop above. To avoid - * extra tests in that, all three characters that require - * special handling are mapped by the MOD to the one tested - * just above. - * Use the source to distinguish between the three cases */ + * extra tests in that, all three characters that always + * require special handling are mapped by the MOD to the + * one tested just above. 
Use the source to distinguish + * between those cases */ #if UNICODE_MAJOR_VERSION > 2 \ || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \ @@ -4145,7 +4237,7 @@ PP(pp_uc) } #endif - /* The other two special handling characters have their + /* The other special handling characters have their * upper cases outside the latin1 range, hence need to be * in UTF-8, so the whole result needs to be in UTF-8. * @@ -4171,14 +4263,32 @@ PP(pp_uc) * not require much extra code. * * First, calculate the extra space needed for the - * remainder of the source needing to be in UTF-8. The + * remainder of the source needing to be in UTF-8. Except + * for the 'i' in Turkic locales, in UTF-8 strings, the * uppercase of a character below 256 occupies the same * number of bytes as the original. Therefore, the space * needed is the that number plus the number of characters - * that become two bytes when converted to UTF-8. */ + * that become two bytes when converted to UTF-8, plus, in + * turkish locales, the number of 'i's. */ extra = send - s + variant_under_utf8_count(s, send); +#ifdef USE_LOCALE_CTYPE + + if (UNLIKELY(*s == 'i')) { /* We wouldn't get an 'i' here + unless are in a Turkic + locale */ + const U8 * s_peek = s; + + do { + extra++; + + s_peek = (U8 *) memchr(s_peek + 1, 'i', + send - (s_peek + 1)); + } while (s_peek != NULL); + } +#endif + /* Convert what we have so far into UTF-8, telling the * function that we know it should be converted, and to * allow extra space for what we haven't processed yet. @@ -4190,20 +4300,42 @@ PP(pp_uc) SvCUR_set(dest, len); len = sv_utf8_upgrade_flags_grow(dest, SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - extra); + extra + + 1 /* trailing NUL */ ); d = (U8*)SvPVX(dest) + len; /* Now process the remainder of the source, simultaneously - * converting to upper and UTF-8. 
*/ - for (; s < send; s++) { - (void) _to_upper_title_latin1(*s, d, &len, 'S'); - d += len; - } - - /* Here have processed the whole source; no need to continue - * with the outer loop. Each character has been converted - * to upper case and converted to UTF-8 */ + * converting to upper and UTF-8. + * + * To avoid extra tests in the loop body, and since the + * loop is so simple, split out the rare Turkic case into + * its own loop */ +#ifdef USE_LOCALE_CTYPE + if ( UNLIKELY(PL_in_utf8_turkic_locale) + && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE))) + { + for (; s < send; s++) { + if (*s == 'i') { + *d++ = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + *d++ = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + else { + (void) _to_upper_title_latin1(*s, d, &len, 'S'); + d += len; + } + } + } + else +#endif + for (; s < send; s++) { + (void) _to_upper_title_latin1(*s, d, &len, 'S'); + d += len; + } + + /* Here have processed the whole source; no need to + * continue with the outer loop. Each character has been + * converted to upper case and converted to UTF-8. */ break; } /* End of processing all latin1-style chars */ } /* End of processing all chars */ @@ -4235,15 +4367,26 @@ PP(pp_lc) SV *dest; const U8 *s; U8 *d; + bool has_turkic_I = FALSE; SvGETMAGIC(source); if ( SvPADTMP(source) && !SvREADONLY(source) && SvPOK(source) - && !DO_UTF8(source)) { + && !DO_UTF8(source) + +#ifdef USE_LOCALE_CTYPE + + && ( LIKELY(! IN_LC_RUNTIME(LC_CTYPE)) + || LIKELY(! PL_in_utf8_turkic_locale)) - /* We can convert in place, as lowercasing anything in the latin1 range - * (or else DO_UTF8 would have been on) doesn't lengthen it */ +#endif + + ) { + + /* We can convert in place, as, outside of Turkic UTF-8 locales, + * lowercasing anything in the latin1 range (or else DO_UTF8 would have + * been on) doesn't lengthen it. 
*/ dest = source; s = d = (U8*)SvPV_force_nomg(source, len); min = len + 1; @@ -4265,7 +4408,38 @@ PP(pp_lc) #ifdef USE_LOCALE_CTYPE if (IN_LC_RUNTIME(LC_CTYPE)) { + const U8 * next_I; + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + /* Lowercasing in a Turkic locale can cause non-UTF-8 to need to become + * UTF-8 for the single case of the character 'I' */ + if ( UNLIKELY(PL_in_utf8_turkic_locale) + && ! DO_UTF8(source) + && (next_I = (U8 *) memchr(s, 'I', len))) + { + Size_t I_count = 0; + const U8 *const send = s + len; + + do { + I_count++; + + next_I = (U8 *) memchr(next_I + 1, 'I', + send - (next_I + 1)); + } while (next_I != NULL); + + /* Except for the 'I', in UTF-8 strings, the lower case of a + * character below 256 occupies the same number of bytes as the + * original. Therefore, the space needed is the original length + * plus I_count plus the number of characters that become two bytes + * when converted to UTF-8 */ + sv_utf8_upgrade_flags_grow(dest, 0, len + + I_count + + variant_under_utf8_count(s, send) + + 1 /* Trailing NUL */ ); + d = (U8*)SvPVX(dest); + has_turkic_I = TRUE; + } } #endif @@ -4276,6 +4450,7 @@ PP(pp_lc) if (DO_UTF8(source)) { const U8 *const send = s + len; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; + bool remove_dot_above = FALSE; while (s < send) { const STRLEN u = UTF8SKIP(s); @@ -4284,7 +4459,33 @@ PP(pp_lc) #ifdef USE_LOCALE_CTYPE _toLOWER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE)); + + /* If we are in a Turkic locale, we have to do more work. As noted + * in the comments for lcfirst, there is a special case if a 'I' + * is in a grapheme with COMBINING DOT ABOVE UTF8. It turns into a + * 'i', and the DOT must be removed. We check for that situation, + * and set a flag if the DOT is there. Then each time through the + * loop, we have to see if we need to remove the next DOT above, + * and if so, do it. 
We know that there is a DOT because + * _toLOWER_utf8_flags() wouldn't have returned 'i' unless there + * was one in a proper position. */ + if ( UNLIKELY(PL_in_utf8_turkic_locale) + && IN_LC_RUNTIME(LC_CTYPE)) + { + if ( UNLIKELY(remove_dot_above) + && memBEGINs(tmpbuf, sizeof(tmpbuf), COMBINING_DOT_ABOVE_UTF8)) + { + s += u; + remove_dot_above = FALSE; + continue; + } + else if (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')) { + remove_dot_above = TRUE; + } + } #else + PERL_UNUSED_VAR(remove_dot_above); + _toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0); #endif @@ -4316,7 +4517,7 @@ PP(pp_lc) SvUTF8_on(dest); *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); - } else { /* Not utf8 */ + } else { /* 'source' not utf8 */ if (len) { const U8 *const send = s + len; @@ -4325,8 +4526,22 @@ PP(pp_lc) * whole thing in a tight loop, for speed, */ #ifdef USE_LOCALE_CTYPE if (IN_LC_RUNTIME(LC_CTYPE)) { - for (; s < send; d++, s++) - *d = toLOWER_LC(*s); + if (LIKELY( ! has_turkic_I)) { + for (; s < send; d++, s++) + *d = toLOWER_LC(*s); + } + else { /* This is the only case where lc() converts 'dest' + into UTF-8 from a non-UTF-8 'source' */ + for (; s < send; s++) { + if (*s == 'I') { + *d++ = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I); + *d++ = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I); + } + else { + append_utf8_from_native_byte(toLOWER_LATIN1(*s), &d); + } + } + } } else #endif @@ -4534,33 +4749,55 @@ PP(pp_fc) #ifdef USE_LOCALE_CTYPE do_uni_folding: #endif - /* For ASCII and the Latin-1 range, there's two + /* For ASCII and the Latin-1 range, there's potentially three * troublesome folds: * \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full * casefolding becomes 'ss'; * \x{B5} (\N{MICRO SIGN}), which under any fold becomes * \x{3BC} (\N{GREEK SMALL LETTER MU}) + * I only in Turkic locales, this folds to \x{131} + * \N{LATIN SMALL LETTER DOTLESS I} * For the rest, the casefold is their lowercase. 
*/ for (; s < send; d++, s++) { - if (*s == MICRO_SIGN) { + if ( UNLIKELY(*s == MICRO_SIGN) +#ifdef USE_LOCALE_CTYPE + || ( UNLIKELY(PL_in_utf8_turkic_locale) + && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE)) + && UNLIKELY(*s == 'I')) +#endif + ) { Size_t extra = send - s + variant_under_utf8_count(s, send); /* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU}, - * which is outside of the latin-1 range. There's a couple - * of ways to deal with this -- khw discusses them in - * pp_lc/uc, so go there :) What we do here is upgrade what - * we had already casefolded, then enter an inner loop that - * appends the rest of the characters as UTF-8. + * and 'I' in Turkic locales is \N{LATIN SMALL LETTER + * DOTLESS I} both of which are outside of the latin-1 + * range. There's a couple of ways to deal with this -- khw + * discusses them in pp_lc/uc, so go there :) What we do + * here is upgrade what we had already casefolded, then + * enter an inner loop that appends the rest of the + * characters as UTF-8. * * First we calculate the needed size of the upgraded dest * beyond what's been processed already (the upgrade - * function figures that out). In UTF-8 strings, the fold case of a + * function figures that out). Except for the 'I' in + * Turkic locales, in UTF-8 strings, the fold case of a * character below 256 occupies the same number of bytes as * the original (even the Sharp S). Therefore, the space * needed is the number of bytes remaining plus the number * of characters that become two bytes when converted to - * UTF-8. 
*/ + * UTF-8 plus, in turkish locales, the number of 'I's */ + + if (UNLIKELY(*s == 'I')) { /* Each 'I' from here to the end folds to the two-byte U+0131, so reserve one extra byte per 'I' */ + const U8 * s_peek = s; + + do { + extra++; + + s_peek = (U8 *) memchr(s_peek + 1, 'I', + send - (s_peek + 1)); + } while (s_peek != NULL); + } /* Growing may move things, so have to save and recalculate * 'd' */ @@ -4568,7 +4805,8 @@ PP(pp_fc) SvCUR_set(dest, len); len = sv_utf8_upgrade_flags_grow(dest, SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - extra); + extra + + 1 /* Trailing NUL */ ); d = (U8*)SvPVX(dest) + len; *d++ = UTF8_TWO_BYTE_HI(GREEK_SMALL_LETTER_MU);