From b3ab6785f6871a84567168e1bd0426ff2f66d282 Mon Sep 17 00:00:00 2001 From: karl williamson Date: Fri, 2 Jan 2009 11:22:02 +0100 Subject: [PATCH] Faster sv_utf8_upgrade() Consider what currently happens when the tokenizer is scanning a string. It looks through it byte-by-byte until it finds a character that forces it to decide to go to utf8. It then calls sv_utf8_upgrade() with the portion of the string scanned so far. sv_utf8_upgrade() starts over from the beginning, and scans the string byte-by-byte until it finds a character that varies between non-utf8 and utf8. It then calls bytes_to_utf8(). bytes_to_utf8() allocates a new string that can handle the worst case expansion, 2n+1, of the entire string, and starts over from the beginning, and scans the input string byte-by-byte copying and converting each character to the output string as it goes. It doesn't return the size of the new string, so sv_utf8_upgrade() assumes it is only as big as what actually got converted, throwing away knowledge of any spare. It then returns to the tokenizer, which immediately does a grow to get space for the unparsed input. This is likely to cause a new string to be allocated and copied from the one we had just created, even if that string in actuality had enough space in it. Thus, the invariant head portion of the string is scanned 3 times, and probably 2 strings will be allocated and copied. My solution to cutting this down is to do several things. First, I added an extra flag for sv_utf8_upgrade that says don't bother to check if the string needs to be converted to utf8, just assume it does. This eliminates one of the passes. I also added a new parameter to sv_utf8_upgrade that says when you return, I want this much unused space in the string. That eliminates the extra grow. This was all done by renaming the current work-horse function from sv_utf8_upgrade_flags to be sv_utf8_upgrade_flags_grow() and making the current function name be a macro which calls the revised one with a 0 grow parameter. I also improved the internal efficiency of sv_utf8_upgrade so that when it does scan the string, it doesn't call bytes_to_utf8, but does the conversion itself, using a fast memory copy instead of the byte-oriented one for the invariant header, and it uses that header to get a better estimate of the needed size of the new string, and it doesn't throw away the knowledge of the allocated size. And, if it is clear without scanning the whole string that the conversion will fit in the already allocated string, it just uses that instead of allocating and copying a new one, using the algorithm I copied from the tokenizer. (In this case it does have to finish scanning the whole string to get the correct size.) The comments have details. It still is byte-oriented. Vectorization et. al. could yield performance improvements. One idea for that is in the comments. The patch also includes a new synonym I created which is a more accurate name than NATIVE_TO_ASCII. --- embed.fnc | 3 +- embed.h | 4 +- global.sym | 2 +- pod/perlintern.pod | 19 ---- proto.h | 9 +- sv.c | 253 +++++++++++++++++++++++++++++++++++++++++++++++------ sv.h | 5 ++ utf8.h | 3 +- utfebcdic.h | 5 +- 9 files changed, 246 insertions(+), 57 deletions(-) diff --git a/embed.fnc b/embed.fnc index 1a0e5d3..749b975 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1871,7 +1871,8 @@ Apd |void |sv_setsv_flags |NN SV *dstr|NULLOK SV *sstr|const I32 flags Apd |void |sv_catpvn_flags|NN SV *const dstr|NN const char *sstr|const STRLEN len \ |const I32 flags Apd |void |sv_catsv_flags |NN SV *const dsv|NULLOK SV *const ssv|const I32 flags -Apd |STRLEN |sv_utf8_upgrade_flags|NN SV *const sv|const I32 flags +Apmd |STRLEN |sv_utf8_upgrade_flags|NN SV *const sv|const I32 flags +Apd |STRLEN |sv_utf8_upgrade_flags_grow|NN SV *const sv|const I32 flags|STRLEN extra Apd |char* |sv_pvn_force_flags|NN SV *const sv|NULLOK STRLEN *const lp|const I32 flags Apd |void |sv_copypv |NN SV *const dsv|NN SV *const ssv Ap |char* |my_atof2 |NN const char *s|NN NV* value diff --git a/embed.h b/embed.h index b1f9741..2d43283 100644 --- a/embed.h +++ b/embed.h @@ -1649,7 +1649,7 @@ #define sv_setsv_flags Perl_sv_setsv_flags #define sv_catpvn_flags Perl_sv_catpvn_flags #define sv_catsv_flags Perl_sv_catsv_flags -#define sv_utf8_upgrade_flags Perl_sv_utf8_upgrade_flags +#define sv_utf8_upgrade_flags_grow Perl_sv_utf8_upgrade_flags_grow #define sv_pvn_force_flags Perl_sv_pvn_force_flags #define sv_copypv Perl_sv_copypv #define my_atof2 Perl_my_atof2 @@ -4007,7 +4007,7 @@ #define sv_setsv_flags(a,b,c) Perl_sv_setsv_flags(aTHX_ a,b,c) #define sv_catpvn_flags(a,b,c,d) Perl_sv_catpvn_flags(aTHX_ a,b,c,d) #define sv_catsv_flags(a,b,c) Perl_sv_catsv_flags(aTHX_ a,b,c) -#define sv_utf8_upgrade_flags(a,b) Perl_sv_utf8_upgrade_flags(aTHX_ a,b) +#define sv_utf8_upgrade_flags_grow(a,b,c) Perl_sv_utf8_upgrade_flags_grow(aTHX_ a,b,c) #define sv_pvn_force_flags(a,b,c) Perl_sv_pvn_force_flags(aTHX_ a,b,c) #define sv_copypv(a,b) Perl_sv_copypv(aTHX_ a,b) #define my_atof2(a,b) Perl_my_atof2(aTHX_ a,b) diff --git a/global.sym b/global.sym index 0b21dcd..1bc756a 100644 --- a/global.sym +++ b/global.sym @@ -709,7 +709,7 @@ Perl_Slab_Free Perl_sv_setsv_flags Perl_sv_catpvn_flags Perl_sv_catsv_flags -Perl_sv_utf8_upgrade_flags +Perl_sv_utf8_upgrade_flags_grow Perl_sv_pvn_force_flags Perl_sv_copypv Perl_my_atof2 diff --git a/pod/perlintern.pod b/pod/perlintern.pod index e622841..4107d5e 100644 --- a/pod/perlintern.pod +++ b/pod/perlintern.pod @@ -470,25 +470,6 @@ Found in file mg.c =over 8 -=item mro_get_linear_isa_c3 -X - -Returns the C3 linearization of @ISA -the given stash. The return value is a read-only AV*. -C should be 0 (it is used internally in this -function's recursion). - -You are responsible for C on the -return value if you plan to store it anywhere -semi-permanently (otherwise it might be deleted -out from under you the next time the cache is -invalidated). - - AV* mro_get_linear_isa_c3(HV* stash, U32 level) - -=for hackers -Found in file mro.c - =item mro_get_linear_isa_dfs X diff --git a/proto.h b/proto.h index ffbc9fb..4207599 100644 --- a/proto.h +++ b/proto.h @@ -5951,11 +5951,16 @@ PERL_CALLCONV void Perl_sv_catsv_flags(pTHX_ SV *const dsv, SV *const ssv, const #define PERL_ARGS_ASSERT_SV_CATSV_FLAGS \ assert(dsv) -PERL_CALLCONV STRLEN Perl_sv_utf8_upgrade_flags(pTHX_ SV *const sv, const I32 flags) - __attribute__nonnull__(pTHX_1); +/* PERL_CALLCONV STRLEN Perl_sv_utf8_upgrade_flags(pTHX_ SV *const sv, const I32 flags) + __attribute__nonnull__(pTHX_1); */ #define PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS \ assert(sv) +PERL_CALLCONV STRLEN Perl_sv_utf8_upgrade_flags_grow(pTHX_ SV *const sv, const I32 flags, STRLEN extra) + __attribute__nonnull__(pTHX_1); +#define PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS_GROW \ + assert(sv) + PERL_CALLCONV char* Perl_sv_pvn_force_flags(pTHX_ SV *const sv, STRLEN *const lp, const I32 flags) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_SV_PVN_FORCE_FLAGS \ diff --git a/sv.c b/sv.c index 9a5d0e3..470d5b1 100644 --- a/sv.c +++ b/sv.c @@ -3173,14 +3173,44 @@ This is not as a general purpose byte encoding to Unicode interface: use the Encode extension for that. =cut + +The grow version is currently not externally documented. It adds a parameter, +extra, which is the number of unused bytes the string of 'sv' is guaranteed to +have free after it upon return. This allows the caller to reserve extra space +that it intends to fill, to avoid extra grows. + +Also externally undocumented for the moment is the flag SV_FORCE_UTF8_UPGRADE, +which can be used to tell this function to not first check to see if there are +any characters that are different in UTF-8 (variant characters) which would +force it to allocate a new string to sv, but to assume there are. Typically +this flag is used by a routine that has already parsed the string to find that +there are such characters, and passes this information on so that the work +doesn't have to be repeated. + +(One might think that the calling routine could pass in the position of the +first such variant, so it wouldn't have to be found again. But that is not the +case, because typically when the caller is likely to use this flag, it won't be +calling this routine unless it finds something that won't fit into a byte. +Otherwise it tries to not upgrade and just use bytes. But some things that +do fit into a byte are variants in utf8, and the caller may not have been +keeping track of these.) + +If the routine itself changes the string, it adds a trailing NUL. Such a NUL +isn't guaranteed due to having other routines do the work in some input cases, +or if the input is already flagged as being in utf8. + +The speed of this could perhaps be improved for many cases if someone wanted to +write a fast function that counts the number of variant characters in a string, +especially if it could return the position of the first one. + */ STRLEN -Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) +Perl_sv_utf8_upgrade_flags_grow(pTHX_ register SV *const sv, const I32 flags, STRLEN extra) { dVAR; - PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS; + PERL_ARGS_ASSERT_SV_UTF8_UPGRADE_FLAGS_GROW; if (sv == &PL_sv_undef) return 0; @@ -3188,14 +3218,17 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) STRLEN len = 0; if (SvREADONLY(sv) && (SvPOKp(sv) || SvIOKp(sv) || SvNOKp(sv))) { (void) sv_2pv_flags(sv,&len, flags); - if (SvUTF8(sv)) + if (SvUTF8(sv)) { + if (extra) SvGROW(sv, SvCUR(sv) + extra); return len; + } } else { (void) SvPV_force(sv,len); } } if (SvUTF8(sv)) { + if (extra) SvGROW(sv, SvCUR(sv) + extra); return SvCUR(sv); } @@ -3203,42 +3236,204 @@ Perl_sv_utf8_upgrade_flags(pTHX_ register SV *const sv, const I32 flags) sv_force_normal_flags(sv, 0); } - if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING)) + if (PL_encoding && !(flags & SV_UTF8_NO_ENCODING)) { sv_recode_to_utf8(sv, PL_encoding); - else { /* Assume Latin-1/EBCDIC */ + if (extra) SvGROW(sv, SvCUR(sv) + extra); + return SvCUR(sv); + } + + if (SvCUR(sv) > 0) { /* Assume Latin-1/EBCDIC */ /* This function could be much more efficient if we * had a FLAG in SVs to signal if there are any variant * chars in the PV. Given that there isn't such a flag - * make the loop as fast as possible. */ - const U8 * const s = (U8 *) SvPVX_const(sv); - const U8 * const e = (U8 *) SvEND(sv); - const U8 *t = s; + * make the loop as fast as possible (although there are certainly ways + * to speed this up, eg. through vectorization) */ + U8 * s = (U8 *) SvPVX_const(sv); + U8 * e = (U8 *) SvEND(sv); + U8 *t = s; + STRLEN two_byte_count = 0; + if (flags & SV_FORCE_UTF8_UPGRADE) goto must_be_utf8; + + /* See if really will need to convert to utf8. We mustn't rely on our + * incoming SV being well formed and having a trailing '\0', as certain + * code in pp_formline can send us partially built SVs. */ + while (t < e) { const U8 ch = *t++; - /* Check for variant */ - if (!NATIVE_IS_INVARIANT(ch)) { - STRLEN len = SvCUR(sv); - /* *Currently* bytes_to_utf8() adds a '\0' after every string - it converts. This isn't documented. It's not clear if it's - a bad thing to be doing, and should be changed to do exactly - what the documentation says. If so, this code will have to - be changed. - As is, we mustn't rely on our incoming SV being well formed - and having a trailing '\0', as certain code in pp_formline - can send us partially built SVs. */ - U8 * const recoded = bytes_to_utf8((U8*)s, &len); - - SvPV_free(sv); /* No longer using what was there before. */ - SvPV_set(sv, (char*)recoded); - SvCUR_set(sv, len); - SvLEN_set(sv, len + 1); /* No longer know the real size. */ - break; - } + if (NATIVE_IS_INVARIANT(ch)) continue; + + t--; /* t already incremented; re-point to first variant */ + two_byte_count = 1; + goto must_be_utf8; } - /* Mark as UTF-8 even if no variant - saves scanning loop */ + + /* utf8 conversion not needed because all are invariants. Mark as + * UTF-8 even if no variant - saves scanning loop */ SvUTF8_on(sv); + return SvCUR(sv); + +must_be_utf8: + + /* Here, the string should be converted to utf8, either because of an + * input flag (two_byte_count = 0), or because a character that + * requires 2 bytes was found (two_byte_count = 1). t points either to + * the beginning of the string (if we didn't examine anything), or to + * the first variant. In either case, everything from s to t - 1 will + * occupy only 1 byte each on output. + * + * There are two main ways to convert. One is to create a new string + * and go through the input starting from the beginning, appending each + * converted value onto the new string as we go along. It's probably + * best to allocate enough space in the string for the worst possible + * case rather than possibly running out of space and having to + * reallocate and then copy what we've done so far. Since everything + * from s to t - 1 is invariant, the destination can be initialized + * with these using a fast memory copy + * + * The other way is to figure out exactly how big the string should be + * by parsing the entire input. Then you don't have to make it big + * enough to handle the worst possible case, and more importantly, if + * the string you already have is large enough, you don't have to + * allocate a new string, you can copy the last character in the input + * string to the final position(s) that will be occupied by the + * converted string and go backwards, stopping at t, since everything + * before that is invariant. + * + * There are advantages and disadvantages to each method. + * + * In the first method, we can allocate a new string, do the memory + * copy from the s to t - 1, and then proceed through the rest of the + * string byte-by-byte. + * + * In the second method, we proceed through the rest of the input + * string just calculating how big the converted string will be. Then + * there are two cases: + * 1) if the string has enough extra space to handle the converted + * value. We go backwards through the string, converting until we + * get to the position we are at now, and then stop. If this + * position is far enough along in the string, this method is + * faster than the other method. If the memory copy were the same + * speed as the byte-by-byte loop, that position would be about + * half-way, as at the half-way mark, parsing to the end and back + * is one complete string's parse, the same amount as starting + * over and going all the way through. Actually, it would be + * somewhat less than half-way, as it's faster to just count bytes + * than to also copy, and we don't have the overhead of allocating + * a new string, changing the scalar to use it, and freeing the + * existing one. But if the memory copy is fast, the break-even + * point is somewhere after half way. The counting loop could be + * sped up by vectorization, etc, to move the break-even point + * further towards the beginning. + * 2) if the string doesn't have enough space to handle the converted + * value. A new string will have to be allocated, and one might + * as well, given that, start from the beginning doing the first + * method. We've spent extra time parsing the string and in + * exchange all we've gotten is that we know precisely how big to + * make the new one. Perl is more optimized for time than space, + * so this case is a loser. + * So what I've decided to do is not use the 2nd method unless it is + * guaranteed that a new string won't have to be allocated, assuming + * the worst case. I also decided not to put any more conditions on it + * than this, for now. It seems likely that, since the worst case is + * twice as big as the unknown portion of the string (plus 1), we won't + * be guaranteed enough space, causing us to go to the first method, + * unless the string is short, or the first variant character is near + * the end of it. In either of these cases, it seems best to use the + * 2nd method. The only circumstance I can think of where this would + * be really slower is if the string had once had much more data in it + * than it does now, but there is still a substantial amount in it */ + + { + STRLEN invariant_head = t - s; + STRLEN size = invariant_head + (e - t) * 2 + 1 + extra; + if (SvLEN(sv) < size) { + + /* Here, have decided to allocate a new string */ + + U8 *dst; + U8 *d; + + Newx(dst, size, U8); + + /* If no known invariants at the beginning of the input string, + * set so starts from there. Otherwise, can use memory copy to + * get up to where we are now, and then start from here */ + + if (invariant_head <= 0) { + d = dst; + } else { + Copy(s, dst, invariant_head, char); + d = dst + invariant_head; + } + + while (t < e) { + const UV uv = NATIVE8_TO_UNI(*t++); + if (UNI_IS_INVARIANT(uv)) + *d++ = (U8)UNI_TO_NATIVE(uv); + else { + *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); + *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); + } + } + *d = '\0'; + SvPV_free(sv); /* No longer using pre-existing string */ + SvPV_set(sv, (char*)dst); + SvCUR_set(sv, d - dst); + SvLEN_set(sv, size); + } else { + + /* Here, have decided to get the exact size of the string. + * Currently this happens only when we know that there is + * guaranteed enough space to fit the converted string, so + * don't have to worry about growing. If two_byte_count is 0, + * then t points to the first byte of the string which hasn't + * been examined yet. Otherwise two_byte_count is 1, and t + * points to the first byte in the string that will expand to + * two. Depending on this, start examining at t or 1 after t. + * */ + + U8 *d = t + two_byte_count; + + + /* Count up the remaining bytes that expand to two */ + + while (d < e) { + const U8 chr = *d++; + if (! NATIVE_IS_INVARIANT(chr)) two_byte_count++; + } + + /* The string will expand by just the number of bytes that + * occupy two positions. But we are one afterwards because of + * the increment just above. This is the place to put the + * trailing NUL, and to set the length before we decrement */ + + d += two_byte_count; + SvCUR_set(sv, d - s); + *d-- = '\0'; + + + /* Having decremented d, it points to the position to put the + * very last byte of the expanded string. Go backwards through + * the string, copying and expanding as we go, stopping when we + * get to the part that is invariant the rest of the way down */ + + e--; + while (e >= t) { + const U8 ch = NATIVE8_TO_UNI(*e--); + if (UNI_IS_INVARIANT(ch)) { + *d-- = UNI_TO_NATIVE(ch); + } else { + *d-- = (U8)UTF8_EIGHT_BIT_LO(ch); + *d-- = (U8)UTF8_EIGHT_BIT_HI(ch); + } + } + } + } } + + /* Mark as UTF-8 even if no variant - saves scanning loop */ + SvUTF8_on(sv); return SvCUR(sv); } diff --git a/sv.h b/sv.h index 43bc541..9acb84e 100644 --- a/sv.h +++ b/sv.h @@ -1716,6 +1716,10 @@ Like sv_utf8_upgrade, but doesn't do magic on C #define SV_COW_OTHER_PVS 1024 /* Make sv_2pv_flags return NULL if something is undefined. */ #define SV_UNDEF_RETURNS_NULL 2048 +/* Tell sv_utf8_upgrade() to not check to see if an upgrade is really needed. + * This is used when the caller has already determined it is, and avoids + * redundant work */ +#define SV_FORCE_UTF8_UPGRADE 4096 /* The core is safe for this COW optimisation. XS code on CPAN may not be. So only default to doing the COW setup if we're in the core. @@ -1775,6 +1779,7 @@ mg.c:1024: warning: left-hand operand of comma expression has no effect #define sv_pvbyte(sv) SvPVbyte_nolen(sv) #define sv_pvn_force_nomg(sv, lp) sv_pvn_force_flags(sv, lp, 0) +#define sv_utf8_upgrade_flags(sv, flags) sv_utf8_upgrade_flags_grow(sv, flags, 0) #define sv_utf8_upgrade_nomg(sv) sv_utf8_upgrade_flags(sv, 0) #define sv_catpvn_nomg(dsv, sstr, slen) sv_catpvn_flags(dsv, sstr, slen, 0) #define sv_setsv(dsv, ssv) \ diff --git a/utf8.h b/utf8.h index e8efd14..f9f1bec 100644 --- a/utf8.h +++ b/utf8.h @@ -51,6 +51,7 @@ END_EXTERN_C /* Native character to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) (ch) +#define NATIVE8_TO_UNI(ch) (ch) #define ASCII_TO_NATIVE(ch) (ch) /* Transform after encoding */ #define NATIVE_TO_UTF(ch) (ch) @@ -213,7 +214,7 @@ encoded character. #define UNICODE_ILLEGAL 0xFFFF /* Though our UTF-8 encoding can go beyond this, - * let's be conservative and do as Unicode 3.2 says. */ + * let's be conservative and do as Unicode 5.1 says. */ #define PERL_UNICODE_MAX 0x10FFFF #define UNICODE_ALLOW_SURROGATE 0x0001 /* Allow UTF-16 surrogates (EVIL) */ diff --git a/utfebcdic.h b/utfebcdic.h index bb88571..7ed9375 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -29,7 +29,7 @@ * in I8, far beyond the current Unicode standard's * max, as shown in the comment later in this file.) * 3) Use the table published in tr16 to convert each byte from step 2 into - * final UTF-EBCDIC. The table in this file is PL_utf2e, and its invverse + * final UTF-EBCDIC. The table in this file is PL_utf2e, and its inverse * is PL_e2utf. They are constructed so that all EBCDIC invariants remain * invariant, but no others do. For example, the ordinal value of 'A' is * 193 in EBCDIC, and also is 193 in UTF-EBCDIC. Step 1) converts it to @@ -406,6 +406,7 @@ END_EXTERN_C /* Native to iso-8859-1 */ #define NATIVE_TO_ASCII(ch) PL_e2a[(U8)(ch)] +#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* synonym */ #define ASCII_TO_NATIVE(ch) PL_a2e[(U8)(ch)] /* Transform after encoding */ #define NATIVE_TO_UTF(ch) PL_e2utf[(U8)(ch)] @@ -461,7 +462,7 @@ END_EXTERN_C #define UNI_IS_INVARIANT(c) ((c) < 0xA0) /* UTF-EBCDIC sematic macros - transform back into UTF-8-Mod and then compare */ -#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_ASCII(c)) +#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c)) #define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c)) #define UTF8_IS_START(c) (NATIVE_TO_UTF(c) >= 0xA0 && (NATIVE_TO_UTF(c) & 0xE0) != 0xA0) #define UTF8_IS_CONTINUATION(c) ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0) -- 1.8.3.1