X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d83f0a8247ea7458731c8479d8cbf3ee1fa81243..fdbe6d7cef8d87e841fccd98211609c6439b71f8:/sv.c diff --git a/sv.c b/sv.c index 1ec559c..2c8f174 100644 --- a/sv.c +++ b/sv.c @@ -1895,6 +1895,13 @@ S_sv_2iuv_common(pTHX_ SV *sv) { certainly cast into the IV range at IV_MAX, whereas the correct answer is the UV IV_MAX +1. Hence < ensures that dodgy boundary cases go to UV */ +#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) + if (Perl_isnan(SvNVX(sv))) { + SvUV_set(sv, 0); + SvIsUV_on(sv); + return FALSE; + } +#endif if (SvNVX(sv) < (NV)IV_MAX + 0.5) { SvIV_set(sv, I_V(SvNVX(sv))); if (SvNVX(sv) == (NV) SvIVX(sv) @@ -4576,7 +4583,7 @@ Perl_sv_unmagic(pTHX_ SV *sv, int type) Safefree(mg->mg_ptr); else if (mg->mg_len == HEf_SVKEY) SvREFCNT_dec((SV*)mg->mg_ptr); - else if (mg->mg_type == PERL_MAGIC_utf8 && mg->mg_ptr) + else if (mg->mg_type == PERL_MAGIC_utf8) Safefree(mg->mg_ptr); } if (mg->mg_flags & MGf_REFCOUNTED) @@ -5285,27 +5292,43 @@ Perl_sv_len_utf8(pTHX_ register SV *sv) return mg_length(sv); else { - STRLEN len, ulen; + STRLEN len; const U8 *s = (U8*)SvPV_const(sv, len); - MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0; - if (mg && mg->mg_len != -1 && (mg->mg_len > 0 || len == 0)) { - ulen = mg->mg_len; -#ifdef PERL_UTF8_CACHE_ASSERT - assert(ulen == Perl_utf8_length(aTHX_ s, s + len)); -#endif - } - else { - ulen = Perl_utf8_length(aTHX_ s, s + len); - if (!mg && !SvREADONLY(sv)) { - sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0); - mg = mg_find(sv, PERL_MAGIC_utf8); - assert(mg); + if (PL_utf8cache) { + STRLEN ulen; + MAGIC *mg = SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : 0; + + if (mg && mg->mg_len != -1) { + ulen = mg->mg_len; + if (PL_utf8cache < 0) { + const STRLEN real = Perl_utf8_length(aTHX_ s, s + len); + if (real != ulen) { + /* Need to turn the assertions off otherwise we may + recurse infinitely while printing error messages. + */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_len_utf8 cache %"UVf + " real %"UVf" for %"SVf, + (UV) ulen, (UV) real, sv); + } + } + } + else { + ulen = Perl_utf8_length(aTHX_ s, s + len); + if (!SvREADONLY(sv)) { + if (!mg) { + mg = sv_magicext(sv, 0, PERL_MAGIC_utf8, + &PL_vtbl_utf8, 0, 0); + } + assert(mg); + mg->mg_len = ulen; + } } - if (mg) - mg->mg_len = ulen; + return ulen; } - return ulen; + return Perl_utf8_length(aTHX_ s, s + len); } } @@ -5326,8 +5349,10 @@ S_utf8_mg_pos_init(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, bool found = FALSE; if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - if (!*mgp) + if (!*mgp) { *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, 0); + (*mgp)->mg_len = -1; + } assert(*mgp); if ((*mgp)->mg_ptr) @@ -5445,7 +5470,7 @@ S_utf8_mg_pos(pTHX_ SV *sv, MAGIC **mgp, STRLEN **cachep, I32 i, I32 *offsetp, I } #ifdef PERL_UTF8_CACHE_ASSERT if (found) { - U8 *s = start; + const U8 *s = start; I32 n = uoff; while (n-- && s < send) @@ -5483,6 +5508,92 @@ type coercion. * */ +static void +S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8); + +static STRLEN +S_sv_pos_u2b_forwards(pTHX_ const U8 *const start, const U8 *const send, + STRLEN uoffset) +{ + const U8 *s = start; + + while (s < send && uoffset--) + s += UTF8SKIP(s); + if (s > send) { + /* This is the existing behaviour. Possibly it should be a croak, as + it's actually a bounds error */ + s = send; + } + return s - start; +} + + +static STRLEN +S_sv_pos_u2b_midway(pTHX_ const U8 *const start, const U8 *send, + STRLEN uoffset, STRLEN uend) +{ + STRLEN backw = uend - uoffset; + if (uoffset < 2 * backw) { + /* The assumption is that going fowards is twice the speed of going + forward (that's where the 2 * backw comes from). + (The real figure of course depends on the UTF-8 data.) */ + return S_sv_pos_u2b_forwards(aTHX_ start, send, uoffset); + } + + while (backw--) { + send--; + while (UTF8_IS_CONTINUATION(*send)) + send--; + } + return send - start; +} + +static STRLEN +S_sv_pos_u2b_cached(pTHX_ SV *sv, MAGIC **mgp, const U8 *const start, + const U8 *const send, STRLEN uoffset, + STRLEN uoffset0, STRLEN boffset0) { + STRLEN boffset; + bool found = FALSE; + + assert (uoffset >= uoffset0); + + if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache + && (*mgp || (*mgp = mg_find(sv, PERL_MAGIC_utf8)))) { + if ((*mgp)->mg_len != -1) { + /* If we can take advantage of a passed in offset, do so. */ + /* In fact, offset0 is either 0, or less than offset, so don't + need to worry about the other possibility. */ + boffset = boffset0 + + S_sv_pos_u2b_midway(aTHX_ start + boffset0, send, + uoffset - uoffset0, + (*mgp)->mg_len - uoffset0); + found = TRUE; + } + } + + if (!found || PL_utf8cache < 0) { + const STRLEN real_boffset + = boffset0 + S_sv_pos_u2b_forwards(aTHX_ start + boffset0, + send, uoffset - uoffset0); + + if (found && PL_utf8cache < 0) { + if (real_boffset != boffset) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_pos_u2b_cache cache %"UVf + " real %"UVf" for %"SVf, + (UV) boffset, (UV) real_boffset, sv); + } + } + boffset = real_boffset; + } + + S_utf8_mg_pos_cache_update(aTHX_ sv, mgp, boffset, uoffset); + return boffset; +} + void Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) { @@ -5494,42 +5605,23 @@ Perl_sv_pos_u2b(pTHX_ register SV *sv, I32* offsetp, I32* lenp) start = (U8*)SvPV_const(sv, len); if (len) { - STRLEN boffset = 0; - STRLEN *cache = NULL; - const U8 *s = start; - I32 uoffset = *offsetp; - const U8 * const send = s + len; + STRLEN uoffset = (STRLEN) *offsetp; + const U8 * const send = start + len; MAGIC *mg = NULL; - bool found = utf8_mg_pos(sv, &mg, &cache, 0, offsetp, *offsetp, &s, start, send); - - if (!found && uoffset > 0) { - while (s < send && uoffset--) - s += UTF8SKIP(s); - if (s >= send) - s = send; - if (utf8_mg_pos_init(sv, &mg, &cache, 0, *offsetp, s, start)) - boffset = cache[1]; - *offsetp = s - start; - } - if (lenp) { - found = FALSE; - start = s; - if (utf8_mg_pos(sv, &mg, &cache, 2, lenp, *lenp, &s, start, send)) { - *lenp -= boffset; - found = TRUE; - } - if (!found && *lenp > 0) { - I32 ulen = *lenp; - if (ulen > 0) - while (s < send && ulen--) - s += UTF8SKIP(s); - if (s >= send) - s = send; - utf8_mg_pos_init(sv, &mg, &cache, 2, *lenp, s, start); - } - *lenp = s - start; - } - ASSERT_UTF8_CACHE(cache); + STRLEN boffset = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send, + uoffset, 0, 0); + + *offsetp = (I32) boffset; + + if (lenp) { + /* Convert the relative offset to absolute. */ + STRLEN uoffset2 = uoffset + (STRLEN) *lenp; + STRLEN boffset2 + = S_sv_pos_u2b_cached(aTHX_ sv, &mg, start, send, uoffset2, + uoffset, boffset) - boffset; + + *lenp = boffset2; + } } else { *offsetp = 0; @@ -5557,117 +5649,181 @@ Handles magic and type coercion. * */ + +static STRLEN +S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target); + +static void +S_utf8_mg_pos_cache_update(pTHX_ SV *sv, MAGIC **mgp, STRLEN byte, STRLEN utf8) +{ + STRLEN *cache; + if (SvREADONLY(sv)) + return; + + if (!*mgp) { + *mgp = sv_magicext(sv, 0, PERL_MAGIC_utf8, (MGVTBL*)&PL_vtbl_utf8, 0, + 0); + (*mgp)->mg_len = -1; + } + assert(*mgp); + + if (!(cache = (STRLEN *)(*mgp)->mg_ptr)) { + Newxz(cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN); + (*mgp)->mg_ptr = (char *) cache; + } + assert(cache); + + if (PL_utf8cache < 0) { + const U8 *start = (const U8 *) SvPVX_const(sv); + const U8 *const end = start + byte; + STRLEN realutf8 = 0; + + while (start < end) { + start += UTF8SKIP(start); + realutf8++; + } + + /* Can't use S_sv_pos_b2u_forwards as it will scream warnings on + surrogates. FIXME - is it inconsistent that b2u warns, but u2b + doesn't? I don't know whether this difference was introduced with + the caching code in 5.8.1. */ + + if (realutf8 != utf8) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: utf8_mg_pos_cache_update cache %"UVf + " real %"UVf" for %"SVf, (UV) utf8, (UV) realutf8, sv); + } + } + cache[0] = utf8; + cache[1] = byte; + ASSERT_UTF8_CACHE(cache); + /* Drop the stale "length" cache */ + cache[2] = 0; + cache[3] = 0; +} + +/* If we don't know the character offset of the end of a region, our only + option is to walk forwards to the target byte offset. */ +static STRLEN +S_sv_pos_b2u_forwards(pTHX_ const U8 *s, const U8 *const target) +{ + STRLEN len = 0; + while (s < target) { + STRLEN n = 1; + + /* Call utf8n_to_uvchr() to validate the sequence + * (unless a simple non-UTF character) */ + if (!UTF8_IS_INVARIANT(*s)) + utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0); + if (n > 0) { + s += n; + len++; + } + else + break; + } + return len; +} + +/* We already know all of the way, now we may be able to walk back. The same + assumption is made as in S_utf8_mg_pos(), namely that walking backward is + twice slower than walking forward. */ +static STRLEN +S_sv_pos_b2u_midway(pTHX_ const U8 *s, const U8 *const target, const U8 *end, + STRLEN endu) +{ + const STRLEN forw = target - s; + STRLEN backw = end - target; + + if (forw < 2 * backw) { + return S_sv_pos_b2u_forwards(aTHX_ s, target); + } + + while (end > target) { + end--; + while (UTF8_IS_CONTINUATION(*end)) { + end--; + } + endu--; + } + return endu; +} + void Perl_sv_pos_b2u(pTHX_ register SV* sv, I32* offsetp) { const U8* s; + const STRLEN byte = *offsetp; STRLEN len; + MAGIC* mg = NULL; + const U8* send; if (!sv) return; s = (const U8*)SvPV_const(sv, len); - if ((I32)len < *offsetp) - Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset"); - else { - const U8* send = s + *offsetp; - MAGIC* mg = NULL; - STRLEN *cache = NULL; - - len = 0; - if (SvMAGICAL(sv) && !SvREADONLY(sv)) { - mg = mg_find(sv, PERL_MAGIC_utf8); - if (mg && mg->mg_ptr) { - cache = (STRLEN *) mg->mg_ptr; - if (cache[1] == (STRLEN)*offsetp) { - /* An exact match. */ - *offsetp = cache[0]; - - return; - } - else if (cache[1] < (STRLEN)*offsetp) { - /* We already know part of the way. */ - len = cache[0]; - s += cache[1]; - /* Let the below loop do the rest. */ - } - else { /* cache[1] > *offsetp */ - /* We already know all of the way, now we may - * be able to walk back. The same assumption - * is made as in S_utf8_mg_pos(), namely that - * walking backward is twice slower than - * walking forward. */ - const STRLEN forw = *offsetp; - STRLEN backw = cache[1] - *offsetp; - - if (!(forw < 2 * backw)) { - const U8 *p = s + cache[1]; - STRLEN ubackw = 0; - - cache[1] -= backw; - - while (backw--) { - p--; - while (UTF8_IS_CONTINUATION(*p)) { - p--; - backw--; - } - ubackw++; - } + if (len < byte) + Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset"); - cache[0] -= ubackw; - *offsetp = cache[0]; + send = s + byte; - /* Drop the stale "length" cache */ - cache[2] = 0; - cache[3] = 0; + if (SvMAGICAL(sv) && !SvREADONLY(sv) && PL_utf8cache + && (mg = mg_find(sv, PERL_MAGIC_utf8))) { + if (mg->mg_ptr) { + STRLEN *cache = (STRLEN *) mg->mg_ptr; + if (cache[1] == byte) { + /* An exact match. */ + *offsetp = cache[0]; - return; - } - } + return; } - ASSERT_UTF8_CACHE(cache); - } - - while (s < send) { - STRLEN n = 1; - - /* Call utf8n_to_uvchr() to validate the sequence - * (unless a simple non-UTF character) */ - if (!UTF8_IS_INVARIANT(*s)) - utf8n_to_uvchr(s, UTF8SKIP(s), &n, 0); - if (n > 0) { - s += n; - len++; + else if (cache[1] < byte) { + /* We already know part of the way. */ + if (mg->mg_len != -1) { + /* Actually, we know the end too. */ + len = cache[0] + + S_sv_pos_b2u_midway(aTHX_ s + cache[1], send, + s + len, mg->mg_len - cache[0]); + } else { + len = cache[0] + + S_sv_pos_b2u_forwards(aTHX_ s + cache[1], send); + } } - else - break; - } + else { /* cache[1] > byte */ + len = S_sv_pos_b2u_midway(aTHX_ s, send, s + cache[1], + cache[0]); - if (!SvREADONLY(sv)) { - if (!mg) { - sv_magic(sv, 0, PERL_MAGIC_utf8, 0, 0); - mg = mg_find(sv, PERL_MAGIC_utf8); } - assert(mg); - - if (!mg->mg_ptr) { - Newxz(cache, PERL_MAGIC_UTF8_CACHESIZE * 2, STRLEN); - mg->mg_ptr = (char *) cache; + ASSERT_UTF8_CACHE(cache); + if (PL_utf8cache < 0) { + const STRLEN reallen = S_sv_pos_b2u_forwards(aTHX_ s, send); + + if (len != reallen) { + /* Need to turn the assertions off otherwise we may recurse + infinitely while printing error messages. */ + SAVEI8(PL_utf8cache); + PL_utf8cache = 0; + Perl_croak(aTHX_ "panic: sv_pos_b2u cache %"UVf + " real %"UVf" for %"SVf, + (UV) len, (UV) reallen, sv); + } } - assert(cache); - - cache[0] = len; - cache[1] = *offsetp; - /* Drop the stale "length" cache */ - cache[2] = 0; - cache[3] = 0; + } else if (mg->mg_len != -1) { + len = S_sv_pos_b2u_midway(aTHX_ s, send, s + len, mg->mg_len); + } else { + len = S_sv_pos_b2u_forwards(aTHX_ s, send); } - - *offsetp = len; } - return; + else { + len = S_sv_pos_b2u_forwards(aTHX_ s, send); + } + *offsetp = len; + + S_utf8_mg_pos_cache_update(aTHX_ sv, &mg, byte, len); } /*