+ return SvCUR(sv);
+
+must_be_utf8:
+
+ /* Here, the string should be converted to utf8, either because of an
+ * input flag (two_byte_count = 0), or because a character that
+ * requires 2 bytes was found (two_byte_count = 1). t points either to
+ * the beginning of the string (if we didn't examine anything), or to
+ * the first variant. In either case, everything from s to t - 1 will
+ * occupy only 1 byte each on output.
+ *
+ * There are two main ways to convert. One is to create a new string
+ * and go through the input starting from the beginning, appending each
+ * converted value onto the new string as we go along. It's probably
+ * best to allocate enough space in the string for the worst possible
+ * case rather than possibly running out of space and having to
+ * reallocate and then copy what we've done so far. Since everything
+ * from s to t - 1 is invariant, the destination can be initialized
+ * with these using a fast memory copy.
+ *
+ * The other way is to figure out exactly how big the string should be
+ * by parsing the entire input. Then you don't have to make it big
+ * enough to handle the worst possible case, and more importantly, if
+ * the string you already have is large enough, you don't have to
+ * allocate a new string, you can copy the last character in the input
+ * string to the final position(s) that will be occupied by the
+ * converted string and go backwards, stopping at t, since everything
+ * before that is invariant.
+ *
+ * There are advantages and disadvantages to each method.
+ *
+ * In the first method, we can allocate a new string, do the memory
+ * copy from the s to t - 1, and then proceed through the rest of the
+ * string byte-by-byte.
+ *
+ * In the second method, we proceed through the rest of the input
+ * string just calculating how big the converted string will be. Then
+ * there are two cases:
+ * 1) if the string has enough extra space to handle the converted
+ * value. We go backwards through the string, converting until we
+ * get to the position we are at now, and then stop. If this
+ * position is far enough along in the string, this method is
+ * faster than the other method. If the memory copy were the same
+ * speed as the byte-by-byte loop, that position would be about
+ * half-way, as at the half-way mark, parsing to the end and back
+ * is one complete string's parse, the same amount as starting
+ * over and going all the way through. Actually, it would be
+ * somewhat less than half-way, as it's faster to just count bytes
+ * than to also copy, and we don't have the overhead of allocating
+ * a new string, changing the scalar to use it, and freeing the
+ * existing one. But if the memory copy is fast, the break-even
+ * point is somewhere after half way. The counting loop could be
+ * sped up by vectorization, etc, to move the break-even point
+ * further towards the beginning.
+ * 2) if the string doesn't have enough space to handle the converted
+ * value. A new string will have to be allocated, and one might
+ * as well, given that, start from the beginning doing the first
+ * method. We've spent extra time parsing the string and in
+ * exchange all we've gotten is that we know precisely how big to
+ * make the new one. Perl is more optimized for time than space,
+ * so this case is a loser.
+ * So what I've decided to do is not use the 2nd method unless it is
+ * guaranteed that a new string won't have to be allocated, assuming
+ * the worst case. I also decided not to put any more conditions on it
+ * than this, for now. It seems likely that, since the worst case is
+ * twice as big as the unknown portion of the string (plus 1), we won't
+ * be guaranteed enough space, causing us to go to the first method,
+ * unless the string is short, or the first variant character is near
+ * the end of it. In either of these cases, it seems best to use the
+ * 2nd method. The only circumstance I can think of where this would
+ * be significantly slower is if the string once held much more data
+ * than it does now, but still holds a substantial amount. */
+
+ {
+ STRLEN invariant_head = t - s;
+ STRLEN size = invariant_head + (e - t) * 2 + 1 + extra;
+ if (SvLEN(sv) < size) {
+
+ /* Here, have decided to allocate a new string */
+
+ U8 *dst;
+ U8 *d;
+
+ Newx(dst, size, U8);
+
+ /* If there are no known invariants at the beginning of the input
+ * string, start writing at the beginning of the new string.
+ * Otherwise, use a memory copy to bring the new string up to where
+ * we are now, and continue converting from here. */
+
+ if (invariant_head <= 0) {
+ d = dst;
+ } else {
+ Copy(s, dst, invariant_head, char);
+ d = dst + invariant_head;
+ }
+
+ while (t < e) {
+ const UV uv = NATIVE8_TO_UNI(*t++);
+ if (UNI_IS_INVARIANT(uv))
+ *d++ = (U8)UNI_TO_NATIVE(uv);
+ else {
+ *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
+ *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
+ }
+ }
+ *d = '\0';
+ SvPV_free(sv); /* No longer using pre-existing string */
+ SvPV_set(sv, (char*)dst);
+ SvCUR_set(sv, d - dst);
+ SvLEN_set(sv, size);
+ } else {
+
+ /* Here, have decided to get the exact size of the string.
+ * Currently this happens only when we know that there is
+ * guaranteed enough space to fit the converted string, so
+ * don't have to worry about growing. If two_byte_count is 0,
+ * then t points to the first byte of the string which hasn't
+ * been examined yet. Otherwise two_byte_count is 1, and t
+ * points to the first byte in the string that will expand to
+ * two. Depending on this, start examining at t or 1 after t.
+ * */
+
+ U8 *d = t + two_byte_count;
+
+
+ /* Count up the remaining bytes that expand to two */
+
+ while (d < e) {
+ const U8 chr = *d++;
+ if (! NATIVE_IS_INVARIANT(chr)) two_byte_count++;
+ }
+
+ /* The string will expand by just the number of bytes that
+ * occupy two positions. d is already one past the last input
+ * byte because of the increment in the loop just above, so after
+ * adding the expansion it is the place to put the trailing NUL,
+ * and to set the length before we decrement. */
+
+ d += two_byte_count;
+ SvCUR_set(sv, d - s);
+ *d-- = '\0';
+
+
+ /* Now that d has been decremented, it points to the position to
+ * put the very last byte of the expanded string. Go backwards
+ * through the string, copying and expanding as we go, stopping
+ * when we get to the part that is invariant the rest of the way
+ * down. */
+
+ e--;
+ while (e >= t) {
+ const U8 ch = NATIVE8_TO_UNI(*e--);
+ if (UNI_IS_INVARIANT(ch)) {
+ *d-- = UNI_TO_NATIVE(ch);
+ } else {
+ *d-- = (U8)UTF8_EIGHT_BIT_LO(ch);
+ *d-- = (U8)UTF8_EIGHT_BIT_HI(ch);
+ }
+ }
+ }
+ }