certainly cast into the IV range at IV_MAX, whereas the correct
answer is the UV IV_MAX +1. Hence < ensures that dodgy boundary
cases go to UV */
+#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
+ if (Perl_isnan(SvNVX(sv))) {
+ SvUV_set(sv, 0);
+ SvIsUV_on(sv);
+ return FALSE;
+ }
+#endif
if (SvNVX(sv) < (NV)IV_MAX + 0.5) {
SvIV_set(sv, I_V(SvNVX(sv)));
if (SvNVX(sv) == (NV) SvIVX(sv)
Safefree(mg->mg_ptr);
else if (mg->mg_len == HEf_SVKEY)
SvREFCNT_dec((SV*)mg->mg_ptr);
- else if (mg->mg_type == PERL_MAGIC_utf8 && mg->mg_ptr)
+ else if (mg->mg_type == PERL_MAGIC_utf8)
Safefree(mg->mg_ptr);
}
if (mg->mg_flags & MGf_REFCOUNTED)
return mg_length(sv);
else
{
- STRLEN len, ulen;
+ STRLEN len;
const U8 *s = (U8*)SvPV_const(sv, len);
- MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0;
- if (mg && mg->mg_len != -1 && (mg->mg_len > 0 || len == 0)) {
- ulen = mg->mg_len;
-#ifdef PERL_UTF8_CACHE_ASSERT
- assert(ulen == Perl_utf8_length(aTHX_ s, s + len));
-#endif
- }
- else {
- ulen = Perl_utf8_length(aTHX_ s, s + len);
- if (!mg && !SvREADONLY(sv)) {
- sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0);
- mg = mg_find(sv, PERL_MAGIC_utf8);
- assert(mg);
+ if (PL_utf8cache) {
+ STRLEN ulen;
+ MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0;
+
+ if (mg && mg->mg_len != -1) {
+ ulen = mg->mg_len;
+ if (PL_utf8cache < 0) {
+ const STRLEN real = Perl_utf8_length(aTHX_ s, s + len);
+ if (real != ulen) {
+ /* Need to turn the assertions off otherwise we may
+ recurse infinitely while printing error messages.
+ */
+ SAVEI8(PL_utf8cache);
+ PL_utf8cache = 0;
+ Perl_croak(aTHX_ "panic: sv_len_utf8 cache %"UVf
+ " real %"UVf" for %"SVf,
+ (UV) ulen, (UV) real, sv);
+ }
+ }
+ }
+ else {
+ ulen = Perl_utf8_length(aTHX_ s, s + len);
+ if (!SvREADONLY(sv)) {
+ if (!mg) {
+ mg = sv_magicext(sv, 0, PERL_MAGIC_utf8,
+ &PL_vtbl_utf8, 0, 0);
+ }
+ assert(mg);
+ mg->mg_len = ulen;
+ }
}
- if (mg)
- mg->mg_len = ulen;
+ return ulen;
}
- return ulen;
+ return Perl_utf8_length(aTHX_ s, s + len);
}
}
bool found = FALSE;
if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
- if (!*mgp)
+ if (!*mgp) {
*mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, 0);
+ (*mgp)->mg_len = -1;
+ }
assert(*mgp);
if ((*mgp)->mg_ptr)
}
#ifdef PERL_UTF8_CACHE_ASSERT
if (found) {
- U8 *s = start;
+ const U8 *s = start;
I32 n = uoff;
while (n-- && s < send)
*
*/
+static void
+S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8);
+
+static STRLEN
+S_sv_pos_u2b_forwards(pTHX_ const U8 *const start, const U8 *const send,
+ STRLEN uoffset)
+{
+ const U8 *s = start;
+
+ while (s < send && uoffset--)
+ s += UTF8SKIP(s);
+ if (s > send) {
+ /* This is the existing behaviour. Possibly it should be a croak, as
+ it's actually a bounds error */
+ s = send;
+ }
+ return s - start;
+}
+
+
+static STRLEN
+S_sv_pos_u2b_midway(pTHX_ const U8 *const start, const U8 *send,
+ STRLEN uoffset, STRLEN uend)
+{
+ STRLEN backw = uend - uoffset;
+ if (uoffset < 2 * backw) {
+ /* The assumption is that going fowards is twice the speed of going
+ forward (that's where the 2 * backw comes from).
+ (The real figure of course depends on the UTF-8 data.) */
+ return S_sv_pos_u2b_forwards(aTHX_ start, send, uoffset);
+ }
+
+ while (backw--) {
+ send--;
+ while (UTF8_IS_CONTINUATION(*send))
+ send--;
+ }
+ return send - start;
+}
+
+static STRLEN
+S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start,
+ const U8 *const send, STRLEN uoffset,
+ STRLEN uoffset0, STRLEN boffset0) {
+ STRLEN boffset;
+ bool found = FALSE;
+
+ assert (uoffset >= uoffset0);
+
+ if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache
+ && (*mgp || (*mgp = mg_find(sv, PERL_MAGIC_utf8)))) {
+ if ((*mgp)->mg_len != -1) {
+ /* If we can take advantage of a passed in offset, do so. */
+ /* In fact, offset0 is either 0, or less than offset, so don't
+ need to worry about the other possibility. */
+ boffset = boffset0
+ + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send,
+ uoffset - uoffset0,
+ (*mgp)->mg_len - uoffset0);
+ found = TRUE;
+ }
+ }
+
+ if (!found || PL_utf8cache < 0) {
+ const STRLEN real_boffset
+ = boffset0 + S_sv_pos_u2b_forwards(aTHX_ start + boffset0,
+ send, uoffset - uoffset0);
+
+ if (found && PL_utf8cache < 0) {
+ if (real_boffset != boffset) {
+ /* Need to turn the assertions off otherwise we may recurse
+ infinitely while printing error messages. */
+ SAVEI8(PL_utf8cache);
+ PL_utf8cache = 0;
+ Perl_croak(aTHX_ "panic: sv_pos_u2b_cache cache %"UVf
+ " real %"UVf" for %"SVf,
+ (UV) boffset, (UV) real_boffset, sv);
+ }
+ }
+ boffset = real_boffset;
+ }
+
+ S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset);
+ return boffset;
+}
+
void
Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp)
{
start = (U8*)SvPV_const(sv, len);
if (len) {
- STRLEN boffset = 0;
- STRLEN *cache = NULL;
- const U8 *s = start;
- I32 uoffset = *offsetp;
- const U8 * const send = s + len;
+ STRLEN uoffset = (STRLEN) *offsetp;
+ const U8 * const send = start + len;
MAGIC *mg = NULL;
- bool found = utf8_mg_pos(sv, &mg, &cache, 0, offsetp, *offsetp, &s, start, send);
-
- if (!found && uoffset > 0) {
- while (s < send && uoffset--)
- s += UTF8SKIP(s);
- if (s >= send)
- s = send;
- if (utf8_mg_pos_init(sv, &mg, &cache, 0, *offsetp, s, start))
- boffset = cache[1];
- *offsetp = s - start;
- }
- if (lenp) {
- found = FALSE;
- start = s;
- if (utf8_mg_pos(sv, &mg, &cache, 2, lenp, *lenp, &s, start, send)) {
- *lenp -= boffset;
- found = TRUE;
- }
- if (!found && *lenp > 0) {
- I32 ulen = *lenp;
- if (ulen > 0)
- while (s < send && ulen--)
- s += UTF8SKIP(s);
- if (s >= send)
- s = send;
- utf8_mg_pos_init(sv, &mg, &cache, 2, *lenp, s, start);
- }
- *lenp = s - start;
- }
- ASSERT_UTF8_CACHE(cache);
+ STRLEN boffset = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send,
+ uoffset, 0, 0);
+
+ *offsetp = (I32) boffset;
+
+ if (lenp) {
+ /* Convert the relative offset to absolute. */
+ STRLEN uoffset2 = uoffset + (STRLEN) *lenp;
+ STRLEN boffset2
+ = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send, uoffset2,
+ uoffset, boffset) - boffset;
+
+ *lenp = boffset2;
+ }
}
else {
*offsetp = 0;
*
*/
+
+static STRLEN
+S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target);
+
+static void
+S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8)
+{
+ STRLEN *cache;
+ if (SvREADONLY(sv))
+ return;
+
+ if (!*mgp) {
+ *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0,
+ 0);
+ (*mgp)->mg_len = -1;
+ }
+ assert(*mgp);
+
+ if (!(cache = (STRLEN *)(*mgp)->mg_ptr)) {
+ Newxz(cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN);
+ (*mgp)->mg_ptr = (char *) cache;
+ }
+ assert(cache);
+
+ if (PL_utf8cache < 0) {
+ const U8 *start = (const U8 *) SvPVX_const(sv);
+ const U8 *const end = start + byte;
+ STRLEN realutf8 = 0;
+
+ while (start < end) {
+ start += UTF8SKIP(start);
+ realutf8++;
+ }
+
+ /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on
+ surrogates. FIXME - is it inconsistent that b2u warns, but u2b
+ doesn't? I don't know whether this difference was introduced with
+ the caching code in 5.8.1. */
+
+ if (realutf8 != utf8) {
+ /* Need to turn the assertions off otherwise we may recurse
+ infinitely while printing error messages. */
+ SAVEI8(PL_utf8cache);
+ PL_utf8cache = 0;
+ Perl_croak(aTHX_ "panic: utf8_mg_pos_cache_update cache %"UVf
+ " real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv);
+ }
+ }
+ cache[0] = utf8;
+ cache[1] = byte;
+ ASSERT_UTF8_CACHE(cache);
+ /* Drop the stale "length" cache */
+ cache[2] = 0;
+ cache[3] = 0;
+}
+
+/* If we don't know the character offset of the end of a region, our only
+ option is to walk forwards to the target byte offset. */
+static STRLEN
+S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target)
+{
+ STRLEN len = 0;
+ while (s < target) {
+ STRLEN n = 1;
+
+ /* Call utf8n_to_uvchr() to validate the sequence
+ * (unless a simple non-UTF character) */
+ if (!UTF8_IS_INVARIANT(*s))
+ utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
+ if (n > 0) {
+ s += n;
+ len++;
+ }
+ else
+ break;
+ }
+ return len;
+}
+
+/* We already know all of the way, now we may be able to walk back. The same
+ assumption is made as in S_utf8_mg_pos(), namely that walking backward is
+ twice slower than walking forward. */
+static STRLEN
+S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end,
+ STRLEN endu)
+{
+ const STRLEN forw = target - s;
+ STRLEN backw = end - target;
+
+ if (forw < 2 * backw) {
+ return S_sv_pos_b2u_forwards(aTHX_ s, target);
+ }
+
+ while (end > target) {
+ end--;
+ while (UTF8_IS_CONTINUATION(*end)) {
+ end--;
+ }
+ endu--;
+ }
+ return endu;
+}
+
void
Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp)
{
const U8* s;
+ const STRLEN byte = *offsetp;
STRLEN len;
+ MAGIC* mg = NULL;
+ const U8* send;
if (!sv)
return;
s = (const U8*)SvPV_const(sv, len);
- if ((I32)len < *offsetp)
- Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
- else {
- const U8* send = s + *offsetp;
- MAGIC* mg = NULL;
- STRLEN *cache = NULL;
-
- len = 0;
- if (SvMAGICAL(sv) && !SvREADONLY(sv)) {
- mg = mg_find(sv, PERL_MAGIC_utf8);
- if (mg && mg->mg_ptr) {
- cache = (STRLEN *) mg->mg_ptr;
- if (cache[1] == (STRLEN)*offsetp) {
- /* An exact match. */
- *offsetp = cache[0];
-
- return;
- }
- else if (cache[1] < (STRLEN)*offsetp) {
- /* We already know part of the way. */
- len = cache[0];
- s += cache[1];
- /* Let the below loop do the rest. */
- }
- else { /* cache[1] > *offsetp */
- /* We already know all of the way, now we may
- * be able to walk back. The same assumption
- * is made as in S_utf8_mg_pos(), namely that
- * walking backward is twice slower than
- * walking forward. */
- const STRLEN forw = *offsetp;
- STRLEN backw = cache[1] - *offsetp;
-
- if (!(forw < 2 * backw)) {
- const U8 *p = s + cache[1];
- STRLEN ubackw = 0;
-
- cache[1] -= backw;
-
- while (backw--) {
- p--;
- while (UTF8_IS_CONTINUATION(*p)) {
- p--;
- backw--;
- }
- ubackw++;
- }
+ if (len < byte)
+ Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
- cache[0] -= ubackw;
- *offsetp = cache[0];
+ send = s + byte;
- /* Drop the stale "length" cache */
- cache[2] = 0;
- cache[3] = 0;
+ if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache
+ && (mg = mg_find(sv, PERL_MAGIC_utf8))) {
+ if (mg->mg_ptr) {
+ STRLEN *cache = (STRLEN *) mg->mg_ptr;
+ if (cache[1] == byte) {
+ /* An exact match. */
+ *offsetp = cache[0];
- return;
- }
- }
+ return;
}
- ASSERT_UTF8_CACHE(cache);
- }
-
- while (s < send) {
- STRLEN n = 1;
-
- /* Call utf8n_to_uvchr() to validate the sequence
- * (unless a simple non-UTF character) */
- if (!UTF8_IS_INVARIANT(*s))
- utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0);
- if (n > 0) {
- s += n;
- len++;
+ else if (cache[1] < byte) {
+ /* We already know part of the way. */
+ if (mg->mg_len != -1) {
+ /* Actually, we know the end too. */
+ len = cache[0]
+ + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send,
+ s + len, mg->mg_len - cache[0]);
+ } else {
+ len = cache[0]
+ + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send);
+ }
}
- else
- break;
- }
+ else { /* cache[1] > byte */
+ len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[1],
+ cache[0]);
- if (!SvREADONLY(sv)) {
- if (!mg) {
- sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0);
- mg = mg_find(sv, PERL_MAGIC_utf8);
}
- assert(mg);
-
- if (!mg->mg_ptr) {
- Newxz(cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN);
- mg->mg_ptr = (char *) cache;
+ ASSERT_UTF8_CACHE(cache);
+ if (PL_utf8cache < 0) {
+ const STRLEN reallen = S_sv_pos_b2u_forwards(aTHX_ s, send);
+
+ if (len != reallen) {
+ /* Need to turn the assertions off otherwise we may recurse
+ infinitely while printing error messages. */
+ SAVEI8(PL_utf8cache);
+ PL_utf8cache = 0;
+ Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf
+ " real %"UVf" for %"SVf,
+ (UV) len, (UV) reallen, sv);
+ }
}
- assert(cache);
-
- cache[0] = len;
- cache[1] = *offsetp;
- /* Drop the stale "length" cache */
- cache[2] = 0;
- cache[3] = 0;
+ } else if (mg->mg_len != -1) {
+ len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len);
+ } else {
+ len = S_sv_pos_b2u_forwards(aTHX_ s, send);
}
-
- *offsetp = len;
}
- return;
+ else {
+ len = S_sv_pos_b2u_forwards(aTHX_ s, send);
+ }
+ *offsetp = len;
+
+ S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len);
}
/*