#include "perl.h"
#include "keywords.h"
+#include "invlist_inline.h"
#include "reentr.h"
#include "regcharclass.h"
else {
dTOPss;
ASSUME(MARK + 1 == SP);
- XPUSHs(sv);
+ MEXTEND(SP, 1);
+ PUSHs(sv);
MARK[1] = &PL_sv_undef;
}
SP = MARK + 2;
dSP;
SV *left, *right;
- tryAMAGICbin_MG(lt_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(lt_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(gt_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(gt_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(le_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(le_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(ge_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(ge_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(ne_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(ne_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
break;
}
- tryAMAGICbin_MG(amg_type, AMGf_set);
+ tryAMAGICbin_MG(amg_type, 0);
{
dPOPTOPssrl;
const int cmp =
PP(pp_seq)
{
dSP;
- tryAMAGICbin_MG(seq_amg, AMGf_set);
+ tryAMAGICbin_MG(seq_amg, 0);
{
dPOPTOPssrl;
SETs(boolSV(sv_eq_flags(left, right, 0)));
PP(pp_sne)
{
dSP;
- tryAMAGICbin_MG(sne_amg, AMGf_set);
+ tryAMAGICbin_MG(sne_amg, 0);
{
dPOPTOPssrl;
SETs(boolSV(!sv_eq_flags(left, right, 0)));
dSP;
SV *sv;
- tryAMAGICun_MG(not_amg, AMGf_set);
+ tryAMAGICun_MG(not_amg, 0);
sv = *PL_stack_sp;
*PL_stack_sp = boolSV(!SvTRUE_nomg_NN(sv));
return NORMAL;
PP(pp_i_lt)
{
dSP;
- tryAMAGICbin_MG(lt_amg, AMGf_set);
+ tryAMAGICbin_MG(lt_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left < right));
PP(pp_i_gt)
{
dSP;
- tryAMAGICbin_MG(gt_amg, AMGf_set);
+ tryAMAGICbin_MG(gt_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left > right));
PP(pp_i_le)
{
dSP;
- tryAMAGICbin_MG(le_amg, AMGf_set);
+ tryAMAGICbin_MG(le_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left <= right));
PP(pp_i_ge)
{
dSP;
- tryAMAGICbin_MG(ge_amg, AMGf_set);
+ tryAMAGICbin_MG(ge_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left >= right));
PP(pp_i_eq)
{
dSP;
- tryAMAGICbin_MG(eq_amg, AMGf_set);
+ tryAMAGICbin_MG(eq_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left == right));
PP(pp_i_ne)
{
dSP;
- tryAMAGICbin_MG(ne_amg, AMGf_set);
+ tryAMAGICbin_MG(ne_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left != right));
STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or
* lowercased) character stored in tmpbuf. May be either
* UTF-8 or not, but in either case is the number of bytes */
+ bool remove_dot_above = FALSE;
s = (const U8*)SvPV_const(source, slen);
#ifdef USE_LOCALE_CTYPE
_toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));
+
+ /* In turkic locales, lower casing an 'I' normally yields U+0131,
+ * LATIN SMALL LETTER DOTLESS I, but not if the grapheme also
+ * contains a COMBINING DOT ABOVE. Instead it is treated like
+ * LATIN CAPITAL LETTER I WITH DOT ABOVE lowercased to 'i'. The
+ * call to lowercase above has handled this. But SpecialCasing.txt
+ * says we are supposed to remove the COMBINING DOT ABOVE. We can
+ * tell if we have this situation if I ==> i in a turkic locale. */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && IN_LC_RUNTIME(LC_CTYPE)
+ && (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')))
+ {
+ /* Here, we know there was a COMBINING DOT ABOVE. We won't be
+ * able to handle this in-place. */
+ inplace = FALSE;
+
+ /* It seems likely that the DOT will immediately follow the
+ * 'I'. If so, we can remove it simply by indicating to the
+ * code below to start copying the source just beyond the DOT.
+ * We know its length is 2 */
+ if (LIKELY(memBEGINs(s + 1, s + slen, COMBINING_DOT_ABOVE_UTF8))) {
+ ulen += 2;
+ }
+ else { /* But if it doesn't follow immediately, set a flag for
+ the code below */
+ remove_dot_above = TRUE;
+ }
+ }
#else
+ PERL_UNUSED_VAR(remove_dot_above);
+
_toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 0);
#endif
* need to be overridden for the tricky ones */
need = slen + 1;
- if (op_type == OP_LCFIRST) {
- /* lower case the first letter: no trickiness for any character */
#ifdef USE_LOCALE_CTYPE
- if (IN_LC_RUNTIME(LC_CTYPE)) {
- *tmpbuf = toLOWER_LC(*s);
- }
- else
-#endif
+
+ if (IN_LC_RUNTIME(LC_CTYPE)) {
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && ( (op_type == OP_LCFIRST && UNLIKELY(*s == 'I'))
+ || (op_type == OP_UCFIRST && UNLIKELY(*s == 'i'))))
{
- *tmpbuf = (IN_UNI_8_BIT)
- ? toLOWER_LATIN1(*s)
- : toLOWER(*s);
+ if (*s == 'I') { /* lcfirst('I') */
+ tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+ tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+ }
+ else { /* ucfirst('i') */
+ tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ tculen = 2;
+ inplace = FALSE;
+ doing_utf8 = TRUE;
+ convert_source_to_utf8 = TRUE;
+ need += variant_under_utf8_count(s, s + slen);
}
- }
-#ifdef USE_LOCALE_CTYPE
- /* is ucfirst() */
- else if (IN_LC_RUNTIME(LC_CTYPE)) {
- if (IN_UTF8_CTYPE_LOCALE) {
- goto do_uni_rules;
+ else if (op_type == OP_LCFIRST) {
+
+ /* For lc, there are no gotchas for UTF-8 locales (other than
+ * the turkish ones already handled above) */
+ *tmpbuf = toLOWER_LC(*s);
}
+ else { /* ucfirst */
- *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any
- locales have upper and title case
- different */
- }
+ /* But for uc, some characters require special handling */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ goto do_uni_rules;
+ }
+
+ /* This would be a bug if any locales have upper and title case
+ * different */
+ *tmpbuf = (U8) toUPPER_LC(*s);
+ }
+ }
+ else
#endif
- else if (! IN_UNI_8_BIT) {
- *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or
- * on EBCDIC machines whatever the
- * native function does */
- }
+ /* Here, not in locale. If not using Unicode rules, is a simple
+ * lower/upper, depending */
+ if (! IN_UNI_8_BIT) {
+ *tmpbuf = (op_type == OP_LCFIRST)
+ ? toLOWER(*s)
+ : toUPPER(*s);
+ }
+ else if (op_type == OP_LCFIRST) {
+ /* lower case the first letter: no trickiness for any character */
+ *tmpbuf = toLOWER_LATIN1(*s);
+ }
else {
/* Here, is ucfirst non-UTF-8, not in locale (unless that locale is
- * UTF-8, which we treat as not in locale), and cased latin1 */
+ * non-turkic UTF-8, which we treat as not in locale), and cased
+ * latin1 */
UV title_ord;
#ifdef USE_LOCALE_CTYPE
do_uni_rules:
+ 1;
/* The (converted) UTF-8 and UTF-EBCDIC lengths of all
- * (both) characters whose title case is above 255 is
+ * characters whose title case is above 255 is
* 2. */
ulen = 2;
}
* of the string. */
sv_setpvn(dest, (char*)tmpbuf, tculen);
if (slen > ulen) {
+
+ /* But this boolean being set means we are in a turkic
+ * locale, and there is a DOT character that needs to be
+ * removed, and it isn't immediately after the current
+ * character. Keep concatenating characters to the output
+ * one at a time, until we find the DOT, which we simply
+ * skip */
+ if (UNLIKELY(remove_dot_above)) {
+ do {
+ Size_t this_len = UTF8SKIP(s + ulen);
+
+ sv_catpvn(dest, (char*)(s + ulen), this_len);
+
+ ulen += this_len;
+ if (memBEGINs(s + ulen, s + slen, COMBINING_DOT_ABOVE_UTF8)) {
+ ulen += 2;
+ break;
+ }
+ } while (s + ulen < s + slen);
+ }
+
+ /* The rest of the string can be concatenated unchanged,
+ * all at once */
sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
}
}
* into tmpbuf. First put that into dest, and then append the
* rest of the source, converting it to UTF-8 as we go. */
- /* Assert tculen is 2 here because the only two characters that
+ /* Assert tculen is 2 here because the only characters that
* get to this part of the code have 2-byte UTF-8 equivalents */
assert(tculen == 2);
*d++ = *tmpbuf;
PP(pp_uc)
{
+ dVAR;
dSP;
SV *source = TOPs;
STRLEN len;
STRLEN u;
STRLEN ulen;
UV uv;
- if (in_iota_subscript && ! _is_utf8_mark(s)) {
+ if (UNLIKELY(in_iota_subscript)) {
+ UV cp = utf8_to_uvchr_buf(s, send, NULL);
+
+ if (! _invlist_contains_cp(PL_utf8_mark, cp)) {
- /* A non-mark. Time to output the iota subscript */
- *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
- *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);
- in_iota_subscript = FALSE;
+ /* A non-mark. Time to output the iota subscript */
+ *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
+ *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);
+ in_iota_subscript = FALSE;
+ }
}
/* Then handle the current character. Get the changed case value
Size_t extra;
*d = toUPPER_LATIN1_MOD(*s);
- if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
+ if ( LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)
+
+#ifdef USE_LOCALE_CTYPE
+
+ && (LIKELY( ! PL_in_utf8_turkic_locale
+ || ! IN_LC_RUNTIME(LC_CTYPE))
+ || *s != 'i')
+#endif
+
+ ) {
continue;
}
/* The mainstream case is the tight loop above. To avoid
- * extra tests in that, all three characters that require
- * special handling are mapped by the MOD to the one tested
- * just above.
- * Use the source to distinguish between the three cases */
+ * extra tests in that, all three characters that always
+ * require special handling are mapped by the MOD to the
+ * one tested just above. Use the source to distinguish
+ * between those cases */
#if UNICODE_MAJOR_VERSION > 2 \
|| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
}
#endif
- /* The other two special handling characters have their
+ /* The other special handling characters have their
* upper cases outside the latin1 range, hence need to be
* in UTF-8, so the whole result needs to be in UTF-8.
*
* not require much extra code.
*
* First, calculate the extra space needed for the
- * remainder of the source needing to be in UTF-8. The
+ * remainder of the source needing to be in UTF-8. Except
+ * for the 'i' in Turkic locales, in UTF-8 strings, the
* uppercase of a character below 256 occupies the same
* number of bytes as the original. Therefore, the space
* needed is the that number plus the number of characters
- * that become two bytes when converted to UTF-8. */
+ * that become two bytes when converted to UTF-8, plus, in
+ * turkish locales, the number of 'i's. */
extra = send - s + variant_under_utf8_count(s, send);
+#ifdef USE_LOCALE_CTYPE
+
+ if (UNLIKELY(*s == 'i')) { /* We wouldn't get an 'i' here
+ unless are in a Turkic
+ locale */
+ const U8 * s_peek = s;
+
+ do {
+ extra++;
+
+ s_peek = (U8 *) memchr(s_peek + 1, 'i',
+ send - (s_peek + 1));
+ } while (s_peek != NULL);
+ }
+#endif
+
/* Convert what we have so far into UTF-8, telling the
* function that we know it should be converted, and to
* allow extra space for what we haven't processed yet.
SvCUR_set(dest, len);
len = sv_utf8_upgrade_flags_grow(dest,
SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- extra);
+ extra
+ + 1 /* trailing NUL */ );
d = (U8*)SvPVX(dest) + len;
/* Now process the remainder of the source, simultaneously
- * converting to upper and UTF-8. */
- for (; s < send; s++) {
- (void) _to_upper_title_latin1(*s, d, &len, 'S');
- d += len;
- }
-
- /* Here have processed the whole source; no need to continue
- * with the outer loop. Each character has been converted
- * to upper case and converted to UTF-8 */
+ * converting to upper and UTF-8.
+ *
+ * To avoid extra tests in the loop body, and since the
+ * loop is so simple, split out the rare Turkic case into
+ * its own loop */
+#ifdef USE_LOCALE_CTYPE
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE)))
+ {
+ for (; s < send; s++) {
+ if (*s == 'i') {
+ *d++ = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ *d++ = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else {
+ (void) _to_upper_title_latin1(*s, d, &len, 'S');
+ d += len;
+ }
+ }
+ }
+ else
+#endif
+ for (; s < send; s++) {
+ (void) _to_upper_title_latin1(*s, d, &len, 'S');
+ d += len;
+ }
+
+ /* Here have processed the whole source; no need to
+ * continue with the outer loop. Each character has been
+ * converted to upper case and converted to UTF-8. */
break;
} /* End of processing all latin1-style chars */
} /* End of processing all chars */
SV *dest;
const U8 *s;
U8 *d;
+ bool has_turkic_I = FALSE;
SvGETMAGIC(source);
if ( SvPADTMP(source)
&& !SvREADONLY(source) && SvPOK(source)
- && !DO_UTF8(source)) {
+ && !DO_UTF8(source)
+
+#ifdef USE_LOCALE_CTYPE
+
+ && ( LIKELY(! IN_LC_RUNTIME(LC_CTYPE))
+ || LIKELY(! PL_in_utf8_turkic_locale))
- /* We can convert in place, as lowercasing anything in the latin1 range
- * (or else DO_UTF8 would have been on) doesn't lengthen it */
+#endif
+
+ ) {
+
+ /* We can convert in place, as, outside of Turkic UTF-8 locales,
+ * lowercasing anything in the latin1 range (or else DO_UTF8 would have
+ * been on) doesn't lengthen it. */
dest = source;
s = d = (U8*)SvPV_force_nomg(source, len);
min = len + 1;
#ifdef USE_LOCALE_CTYPE
if (IN_LC_RUNTIME(LC_CTYPE)) {
+ const U8 * next_I;
+
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ /* Lowercasing in a Turkic locale can cause non-UTF-8 to need to become
+ * UTF-8 for the single case of the character 'I' */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && ! DO_UTF8(source)
+ && (next_I = (U8 *) memchr(s, 'I', len)))
+ {
+ Size_t I_count = 0;
+ const U8 *const send = s + len;
+
+ do {
+ I_count++;
+
+ next_I = (U8 *) memchr(next_I + 1, 'I',
+ send - (next_I + 1));
+ } while (next_I != NULL);
+
+ /* Except for the 'I', in UTF-8 strings, the lower case of a
+ * character below 256 occupies the same number of bytes as the
+ * original. Therefore, the space needed is the original length
+ * plus I_count plus the number of characters that become two bytes
+ * when converted to UTF-8 */
+ sv_utf8_upgrade_flags_grow(dest, 0, len
+ + I_count
+ + variant_under_utf8_count(s, send)
+ + 1 /* Trailing NUL */ );
+ d = (U8*)SvPVX(dest);
+ has_turkic_I = TRUE;
+ }
}
#endif
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+ bool remove_dot_above = FALSE;
while (s < send) {
const STRLEN u = UTF8SKIP(s);
#ifdef USE_LOCALE_CTYPE
_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));
+
+ /* If we are in a Turkic locale, we have to do more work. As noted
+ * in the comments for lcfirst, there is a special case if a 'I'
+ * is in a grapheme with COMBINING DOT ABOVE UTF8. It turns into a
+ * 'i', and the DOT must be removed. We check for that situation,
+ * and set a flag if the DOT is there. Then each time through the
+ * loop, we have to see if we need to remove the next DOT above,
+ * and if so, do it. We know that there is a DOT because
+ * _toLOWER_utf8_flags() wouldn't have returned 'i' unless there
+ * was one in a proper position. */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && IN_LC_RUNTIME(LC_CTYPE))
+ {
+ if ( UNLIKELY(remove_dot_above)
+ && memBEGINs(tmpbuf, sizeof(tmpbuf), COMBINING_DOT_ABOVE_UTF8))
+ {
+ s += u;
+ remove_dot_above = FALSE;
+ continue;
+ }
+ else if (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')) {
+ remove_dot_above = TRUE;
+ }
+ }
#else
+ PERL_UNUSED_VAR(remove_dot_above);
+
_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0);
#endif
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
- } else { /* Not utf8 */
+ } else { /* 'source' not utf8 */
if (len) {
const U8 *const send = s + len;
* whole thing in a tight loop, for speed, */
#ifdef USE_LOCALE_CTYPE
if (IN_LC_RUNTIME(LC_CTYPE)) {
- for (; s < send; d++, s++)
- *d = toLOWER_LC(*s);
+ if (LIKELY( ! has_turkic_I)) {
+ for (; s < send; d++, s++)
+ *d = toLOWER_LC(*s);
+ }
+ else { /* This is the only case where lc() converts 'dest'
+ into UTF-8 from a non-UTF-8 'source' */
+ for (; s < send; s++) {
+ if (*s == 'I') {
+ *d++ = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+ *d++ = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+ }
+ else {
+ append_utf8_from_native_byte(toLOWER_LATIN1(*s), &d);
+ }
+ }
+ }
}
else
#endif
#ifdef USE_LOCALE_CTYPE
do_uni_folding:
#endif
- /* For ASCII and the Latin-1 range, there's two
+ /* For ASCII and the Latin-1 range, there's potentially three
* troublesome folds:
* \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full
* casefolding becomes 'ss';
* \x{B5} (\N{MICRO SIGN}), which under any fold becomes
* \x{3BC} (\N{GREEK SMALL LETTER MU})
+ * I only in Turkic locales, this folds to \x{131}
+ * \N{LATIN SMALL LETTER DOTLESS I}
* For the rest, the casefold is their lowercase. */
for (; s < send; d++, s++) {
- if (*s == MICRO_SIGN) {
+ if ( UNLIKELY(*s == MICRO_SIGN)
+#ifdef USE_LOCALE_CTYPE
+ || ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE))
+ && UNLIKELY(*s == 'I'))
+#endif
+ ) {
Size_t extra = send - s
+ variant_under_utf8_count(s, send);
/* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU},
- * which is outside of the latin-1 range. There's a couple
- * of ways to deal with this -- khw discusses them in
- * pp_lc/uc, so go there :) What we do here is upgrade what
- * we had already casefolded, then enter an inner loop that
- * appends the rest of the characters as UTF-8.
+ * and 'I' in Turkic locales is \N{LATIN SMALL LETTER
+ * DOTLESS I} both of which are outside of the latin-1
+ * range. There's a couple of ways to deal with this -- khw
+ * discusses them in pp_lc/uc, so go there :) What we do
+ * here is upgrade what we had already casefolded, then
+ * enter an inner loop that appends the rest of the
+ * characters as UTF-8.
*
* First we calculate the needed size of the upgraded dest
* beyond what's been processed already (the upgrade
- * function figures that out). In UTF-8 strings, the fold case of a
+ * function figures that out). Except for the 'I' in
+ * Turkic locales, in UTF-8 strings, the fold case of a
* character below 256 occupies the same number of bytes as
* the original (even the Sharp S). Therefore, the space
* needed is the number of bytes remaining plus the number
* of characters that become two bytes when converted to
- * UTF-8. */
+ * UTF-8 plus, in turkish locales, the number of 'I's */
+
+ if (UNLIKELY(*s == 'I')) {
+ const U8 * s_peek = s;
+
+ do {
+ extra++;
+
+ s_peek = (U8 *) memchr(s_peek + 1, 'i',
+ send - (s_peek + 1));
+ } while (s_peek != NULL);
+ }
/* Growing may move things, so have to save and recalculate
* 'd' */
SvCUR_set(dest, len);
len = sv_utf8_upgrade_flags_grow(dest,
SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- extra);
+ extra
+ + 1 /* Trailing NUL */ );
d = (U8*)SvPVX(dest) + len;
*d++ = UTF8_TWO_BYTE_HI(GREEK_SMALL_LETTER_MU);