From f34acfecc286f2eff2450db713da005d888a7317 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 4 Nov 2019 21:30:48 -0700 Subject: [PATCH] Reimplement tr/// without swashes This large commit removes the last use of swashes from core. It replaces swashes by inversion maps. This data structure is already in use for some Unicode properties, such as case changing. The inversion map data structure leads to straight forward implementation code, so I collapsed the two doop.c routines do_trans_complex_utf8() and do_trans_simple_utf8() into one. A few conditionals could be avoided in the loop if this function were split so that one version didn't have to test for, e.g., squashing, but I suspect these are in the noise in the loop, which has to deal with UTF-8 conversions. This should be faster than the previous implementation anyway. I measured the differences some releases back, and inversion maps were faster than the equivalent swash for up to 512 or 1024 different ranges. These numbers are unlikely to be exceeded in tr/// except possibly in machine-generated ones. Inversion maps are capable of handling both UTF-8 and non-UTF-8 cases, but I left in the existing non-UTF-8 implementation, which uses tables, because I suspect it is faster. This means that there is extra code, purely for runtime performance. An inversion map is always created from the input, and then if the table implementation is to be used, the table is easily derived from the map. Prior to this commit, the table implementation was used in certain edge cases involving code points above 255. Those cases are now handled by the inversion map implementation, because it would have taken extra code to detect them, and I didn't think it was worth it. That could be changed if I am wrong. Creating an inversion map for all inputs essentially normalizes them, and then the same logic is usable for all. This fixes some false negatives in the previous implementation. It also allows for detecting if the actual transliteration can be done in place. Previously, the code mostly punted on that detection for the UTF-8 case. This also allows for accurate counting of the lengths of the two sides, fixing some longstanding TODO warning tests. A new flag is created, OPpTRANS_CAN_FORCE_UTF8, when the tr/// has a below 256 character resolving to one that requires UTF-8. If this isn't set, the code knows that a non-UTF-8 input won't become UTF-8 in the process, and so can take short cuts. The bit representing this flag is the same as OPpTRANS_FROM_UTF, which is no longer used. That name is left in so that the dozen-ish modules in cpan that refer to it can still compile. AFAICT none of them actually use the flag, as well they shouldn't since it is private to the core. Inversion maps are ideally suited for tr/// implementations. An issue with them in general is that for some pathological data, they can become fragmented requiring more space than you would expect, to represent the underlying data. However, the typical tr/// would not have this issue, requiring only very short inversion maps to represent; in some cases shorter than the table implementation. Inversion maps are also easier to deparse than swashes. A deparse TODO was also fixed by this commit, and the code to deparse UTF-8 inputs is simplified. One could implement specialized data structures for specific types of inputs. For example, a common tr/// form is a single range, like tr/A-Z/a-z/. That could be implemented without a table and be quite fast. An intermediate step would be to use the inversion map implementation always when the transliteration is a single range, and then special case length=1 maps at execution time. Thanks to Nicholas Rochemagne for his help on B --- doop.c | 506 +++++++++------------ dump.c | 13 +- embed.fnc | 10 +- embed.h | 25 +- invlist_inline.h | 5 +- lib/B/Deparse.pm | 163 +++---- op.c | 1310 +++++++++++++++++++++++++++++++++++++++--------------- op.h | 2 +- proto.h | 119 +++-- toke.c | 22 +- 10 files changed, 1341 insertions(+), 834 deletions(-) diff --git a/doop.c b/doop.c index e0d63f1..3cb1354 100644 --- a/doop.c +++ b/doop.c @@ -22,6 +22,7 @@ #include "EXTERN.h" #define PERL_IN_DOOP_C #include "perl.h" +#include "invlist_inline.h" #ifndef PERL_MICRO #include @@ -297,328 +298,240 @@ S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const tbl) /* Helper function for do_trans(). - * Handles utf8 cases(*) not involving the /c, /d, /s flags, - * and where search and replacement charlists aren't identical. - * (*) i.e. where the search or replacement charlists are utf8. sv may - * or may not be utf8. + * Handles cases where an inversion map implementation is to be used and the + * search and replacement charlists are identical: so the string isn't + * modified, and only a count of modifiable chars is needed. + * + * Note that it doesn't handle /d nor /s, since these modify the string + * even if the replacement charlist is empty. + * + * sv may or may not be utf8. */ STATIC Size_t -S_do_trans_simple_utf8(pTHX_ SV * const sv) +S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const invmap) { U8 *s; U8 *send; - U8 *d; - U8 *start; - U8 *dstart, *dend; Size_t matches = 0; - const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS); STRLEN len; - SV* const rv = -#ifdef USE_ITHREADS - PAD_SVl(cPADOP->op_padix); -#else - MUTABLE_SV(cSVOP->op_sv); -#endif - HV* const hv = MUTABLE_HV(SvRV(rv)); - SV* const * svp = hv_fetchs(hv, "NONE", FALSE); - const UV none = svp ? SvUV(*svp) : 0x7fffffff; - const UV extra = none + 1; - UV final = 0; - U8 hibit = 0; + SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE); + SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE); + SV* from_invlist = *from_invlist_ptr; + SV* to_invmap_sv = *to_invmap_ptr; + UV* map = (UV *) SvPVX(to_invmap_sv); - PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8; + PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP; s = (U8*)SvPV_nomg(sv, len); - if (!SvUTF8(sv)) { - hibit = ! is_utf8_invariant_string(s, len); - if (hibit) { - s = bytes_to_utf8(s, &len); - } - } + send = s + len; - start = s; - svp = hv_fetchs(hv, "FINAL", FALSE); - if (svp) - final = SvUV(*svp); + while (s < send) { + UV from; + SSize_t i; + STRLEN s_len; + + /* Get the code point of the next character in the string */ + if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) { + from = *s; + s_len = 1; + } + else { + from = utf8_to_uvchr_buf(s, send, &s_len); + if (from == 0 && *s != '\0') { + _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE); + } + } - if (grows) { - /* d needs to be bigger than s, in case e.g. upgrading is required */ - Newx(d, len * 3 + UTF8_MAXBYTES, U8); - dend = d + len * 3; - dstart = d; - } - else { - dstart = d = s; - dend = d + len; - } + /* Look the code point up in the data structure for this tr/// to get + * what it maps to */ + i = _invlist_search(from_invlist, from); + assert(i >= 0); - while (s < send) { - const UV uv = swash_fetch(rv, s, TRUE); - if (uv < none) { - s += UTF8SKIP(s); - matches++; - d = uvchr_to_utf8(d, uv); - } - else if (uv == none) { - const int i = UTF8SKIP(s); - Move(s, d, i, U8); - d += i; - s += i; - } - else if (uv == extra) { - s += UTF8SKIP(s); - matches++; - d = uvchr_to_utf8(d, final); - } - else - s += UTF8SKIP(s); - - if (d > dend) { - const STRLEN clen = d - dstart; - const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; - if (!grows) - Perl_croak(aTHX_ "panic: do_trans_simple_utf8 line %d",__LINE__); - Renew(dstart, nlen + UTF8_MAXBYTES, U8); - d = dstart + clen; - dend = dstart + nlen; - } - } - if (grows || hibit) { - sv_setpvn(sv, (char*)dstart, d - dstart); - Safefree(dstart); - if (grows && hibit) - Safefree(start); - } - else { - *d = '\0'; - SvCUR_set(sv, d - dstart); + if (map[i] != (UV) TR_UNLISTED) { + matches++; + } + + s += s_len; } - SvSETMAGIC(sv); - SvUTF8_on(sv); return matches; } /* Helper function for do_trans(). - * Handles utf8 cases(*) where search and replacement charlists are - * identical: so the string isn't modified, and only a count of modifiable - * chars is needed. - * Note that it doesn't handle /d or /s, since these modify the string - * even if the replacement charlist is empty. - * (*) i.e. where the search or replacement charlists are utf8. sv may - * or may not be utf8. + * Handles cases where an inversion map implementation is to be used and the + * search and replacement charlists are either not identical or flags are + * present. + * + * sv may or may not be utf8. */ STATIC Size_t -S_do_trans_count_utf8(pTHX_ SV * const sv) +S_do_trans_invmap(pTHX_ SV * const sv, AV * const invmap) { - const U8 *s; - const U8 *start = NULL; - const U8 *send; + U8 *s; + U8 *send; + U8 *d; + U8 *s0; + U8 *d0; Size_t matches = 0; STRLEN len; - SV* const rv = -#ifdef USE_ITHREADS - PAD_SVl(cPADOP->op_padix); -#else - MUTABLE_SV(cSVOP->op_sv); -#endif - HV* const hv = MUTABLE_HV(SvRV(rv)); - SV* const * const svp = hv_fetchs(hv, "NONE", FALSE); - const UV none = svp ? SvUV(*svp) : 0x7fffffff; - const UV extra = none + 1; - U8 hibit = 0; - - PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8; + SV** const from_invlist_ptr = av_fetch(invmap, 0, TRUE); + SV** const to_invmap_ptr = av_fetch(invmap, 1, TRUE); + SV** const to_expansion_ptr = av_fetch(invmap, 2, TRUE); + NV max_expansion = SvNV(*to_expansion_ptr); + SV* from_invlist = *from_invlist_ptr; + SV* to_invmap_sv = *to_invmap_ptr; + UV* map = (UV *) SvPVX(to_invmap_sv); + UV previous_map = TR_OOB; + const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH); + const bool delete_unfound = cBOOL(PL_op->op_private & OPpTRANS_DELETE); + bool inplace = ! cBOOL(PL_op->op_private & OPpTRANS_GROWS); + const UV* from_array = invlist_array(from_invlist); + UV final_map; + bool out_is_utf8 = SvUTF8(sv); + STRLEN s_len; + + PERL_ARGS_ASSERT_DO_TRANS_INVMAP; + + /* A third element in the array indicates that the replacement list was + * shorter than the search list, and this element contains the value to use + * for the items that don't correspond */ + if (av_top_index(invmap) >= 3) { + SV** const final_map_ptr = av_fetch(invmap, 3, TRUE); + SV* const final_map_sv = *final_map_ptr; + final_map = SvUV(final_map_sv); + } - s = (const U8*)SvPV_nomg_const(sv, len); - if (!SvUTF8(sv)) { - hibit = ! is_utf8_invariant_string(s, len); - if (hibit) { - start = s = bytes_to_utf8(s, &len); - } + /* If there is something in the transliteration that could force the input + * to be changed to UTF-8, we don't know if we can do it in place, so + * assume cannot */ + if (! out_is_utf8 && (PL_op->op_private & OPpTRANS_CAN_FORCE_UTF8)) { + inplace = FALSE; + if (max_expansion < 2) { + max_expansion = 2; + } } + + s = (U8*)SvPV_nomg(sv, len); send = s + len; + s0 = s; - while (s < send) { - const UV uv = swash_fetch(rv, s, TRUE); - if (uv < none || uv == extra) - matches++; - s += UTF8SKIP(s); + /* We know by now if there are some possible input strings whose + * transliterations are longer than the input. If none can, we just edit + * in place. */ + if (inplace) { + d0 = d = s; + } + else { + /* Here, we can't edit in place. We have no idea how much, if any, + * this particular input string will grow. However, the compilation + * calculated the maximum expansion possible. Use that to allocale + * based on the worst case scenario. */ + Newx(d, len * max_expansion + 1, U8); + d0 = d; } - if (hibit) - Safefree(start); - return matches; -} + restart: + /* Do the actual transliteration */ + while (s < send) { + UV from; + UV to; + SSize_t i; + STRLEN s_len; + + /* Get the code point of the next character in the string */ + if (! SvUTF8(sv) || UTF8_IS_INVARIANT(*s)) { + from = *s; + s_len = 1; + } + else { + from = utf8_to_uvchr_buf(s, send, &s_len); + if (from == 0 && *s != '\0') { + _force_out_malformed_utf8_message(s, send, 0, /*die*/TRUE); + } + } -/* Helper function for do_trans(). - * Handles utf8 cases(*) involving the /c, /d, /s flags, - * and where search and replacement charlists aren't identical. - * (*) i.e. where the search or replacement charlists are utf8. sv may - * or may not be utf8. - */ + /* Look the code point up in the data structure for this tr/// to get + * what it maps to */ + i = _invlist_search(from_invlist, from); + assert(i >= 0); -STATIC Size_t -S_do_trans_complex_utf8(pTHX_ SV * const sv) -{ - U8 *start, *send; - U8 *d; - Size_t matches = 0; - const bool squash = cBOOL(PL_op->op_private & OPpTRANS_SQUASH); - const bool del = cBOOL(PL_op->op_private & OPpTRANS_DELETE); - const bool grows = cBOOL(PL_op->op_private & OPpTRANS_GROWS); - SV* const rv = -#ifdef USE_ITHREADS - PAD_SVl(cPADOP->op_padix); -#else - MUTABLE_SV(cSVOP->op_sv); -#endif - HV * const hv = MUTABLE_HV(SvRV(rv)); - SV * const *svp = hv_fetchs(hv, "NONE", FALSE); - const UV none = svp ? SvUV(*svp) : 0x7fffffff; - const UV extra = none + 1; - UV final = 0; - bool havefinal = FALSE; - STRLEN len; - U8 *dstart, *dend; - U8 hibit = 0; - U8 *s = (U8*)SvPV_nomg(sv, len); + to = map[i]; - PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8; + if (to == (UV) TR_UNLISTED) { /* Just copy the unreplaced character */ + if (UVCHR_IS_INVARIANT(from) || ! out_is_utf8) { + *d++ = from; + } + else if (SvUTF8(sv)) { + Move(s, d, s_len, U8); + d += s_len; + } + else { /* Convert to UTF-8 */ + append_utf8_from_native_byte(*s, &d); + } - if (!SvUTF8(sv)) { - hibit = ! is_utf8_invariant_string(s, len); - if (hibit) { - s = bytes_to_utf8(s, &len); + previous_map = to; + s += s_len; + continue; } - } - send = s + len; - start = s; - svp = hv_fetchs(hv, "FINAL", FALSE); - if (svp) { - final = SvUV(*svp); - havefinal = TRUE; - } + /* Everything else is counted as a match */ + matches++; - if (grows) { - /* d needs to be bigger than s, in case e.g. upgrading is required */ - Newx(d, len * 3 + UTF8_MAXBYTES, U8); - dend = d + len * 3; - dstart = d; - } - else { - dstart = d = s; - dend = d + len; - } + if (to == (UV) TR_SPECIAL_HANDLING) { + if (delete_unfound) { + previous_map = to; + s += s_len; + continue; + } - if (squash) { - UV puv = 0xfeedface; - while (s < send) { - UV uv = swash_fetch(rv, s, TRUE); - - if (d > dend) { - const STRLEN clen = d - dstart; - const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; - if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); - Renew(dstart, nlen + UTF8_MAXBYTES, U8); - d = dstart + clen; - dend = dstart + nlen; - } - if (uv < none) { - matches++; - s += UTF8SKIP(s); - if (uv != puv) { - d = uvchr_to_utf8(d, uv); - puv = uv; - } - continue; - } - else if (uv == none) { /* "none" is unmapped character */ - const int i = UTF8SKIP(s); - Move(s, d, i, U8); - d += i; - s += i; - puv = 0xfeedface; - continue; - } - else if (uv == extra && !del) { - matches++; - if (havefinal) { - s += UTF8SKIP(s); - if (puv != final) { - d = uvchr_to_utf8(d, final); - puv = final; - } - } - else { - STRLEN len; - uv = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT); - if (uv != puv) { - Move(s, d, len, U8); - d += len; - puv = uv; - } - s += len; - } - continue; - } - matches++; /* "none+1" is delete character */ - s += UTF8SKIP(s); - } - } - else { - while (s < send) { - const UV uv = swash_fetch(rv, s, TRUE); - if (d > dend) { - const STRLEN clen = d - dstart; - const STRLEN nlen = dend - dstart + len + UTF8_MAXBYTES; - if (!grows) - Perl_croak(aTHX_ "panic: do_trans_complex_utf8 line %d",__LINE__); - Renew(dstart, nlen + UTF8_MAXBYTES, U8); - d = dstart + clen; - dend = dstart + nlen; - } - if (uv < none) { - matches++; - s += UTF8SKIP(s); - d = uvchr_to_utf8(d, uv); - continue; - } - else if (uv == none) { /* "none" is unmapped character */ - const int i = UTF8SKIP(s); - Move(s, d, i, U8); - d += i; - s += i; - continue; - } - else if (uv == extra && !del) { - matches++; - s += UTF8SKIP(s); - d = uvchr_to_utf8(d, final); - continue; - } - matches++; /* "none+1" is delete character */ - s += UTF8SKIP(s); - } + /* Use the final character in the replacement list */ + to = final_map; + } + else { /* Here the input code point is to be remapped. The actual + value is offset from the base of this entry */ + to += from - from_array[i]; + } + + /* If copying all occurrences, or this is the first occurrence, copy it + * to the output */ + if (! squash || to != previous_map) { + if (out_is_utf8) { + d = uvchr_to_utf8(d, to); + } + else { + if (to >= 256) { /* If need to convert to UTF-8, restart */ + out_is_utf8 = TRUE; + s = s0; + d = d0; + matches = 0; + goto restart; + } + *d++ = to; + } + } + + previous_map = to; + s += s_len; } - if (grows || hibit) { - sv_setpvn(sv, (char*)dstart, d - dstart); - Safefree(dstart); - if (grows && hibit) - Safefree(start); + + s_len = 0; + s += s_len; + if (! inplace) { + sv_setpvn(sv, (char*)d0, d - d0); } else { *d = '\0'; - SvCUR_set(sv, d - dstart); + SvCUR_set(sv, d - d0); + } + + if (! SvUTF8(sv) && out_is_utf8) { + SvUTF8_on(sv); } - SvUTF8_on(sv); SvSETMAGIC(sv); return matches; @@ -627,7 +540,8 @@ S_do_trans_complex_utf8(pTHX_ SV * const sv) /* Execute a tr//. sv is the value to be translated, while PL_op * should be an OP_TRANS or OP_TRANSR op, whose op_pv field contains a - * translation table or whose op_sv field contains a swash. + * translation table or whose op_sv field contains an inversion map. + * * Returns a count of number of characters translated */ @@ -636,31 +550,49 @@ Perl_do_trans(pTHX_ SV *sv) { STRLEN len; const U8 flags = PL_op->op_private; - const U8 hasutf = flags & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF); + bool use_utf8_fcns = cBOOL(flags & OPpTRANS_USE_SVOP); + bool identical = cBOOL(flags & OPpTRANS_IDENTICAL); PERL_ARGS_ASSERT_DO_TRANS; - if (SvREADONLY(sv) && !(flags & OPpTRANS_IDENTICAL)) { + if (SvREADONLY(sv) && ! identical) { Perl_croak_no_modify(); } (void)SvPV_const(sv, len); if (!len) return 0; - if (!(flags & OPpTRANS_IDENTICAL)) { + if (! identical) { if (!SvPOKp(sv) || SvTHINKFIRST(sv)) (void)SvPV_force_nomg(sv, len); (void)SvPOK_only_UTF8(sv); } - /* If we use only OPpTRANS_IDENTICAL to bypass the READONLY check, - * we must also rely on it to choose the readonly strategy. - */ - if (flags & OPpTRANS_IDENTICAL) { - return hasutf ? do_trans_count_utf8(sv) : do_trans_count(sv, (OPtrans_map*)cPVOP->op_pv); - } else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) { - return hasutf ? do_trans_complex_utf8(sv) : do_trans_complex(sv, (OPtrans_map*)cPVOP->op_pv); - } else { - return hasutf ? do_trans_simple_utf8(sv) : do_trans_simple(sv, (OPtrans_map*)cPVOP->op_pv); + if (use_utf8_fcns) { + SV* const map = +#ifdef USE_ITHREADS + PAD_SVl(cPADOP->op_padix); +#else + MUTABLE_SV(cSVOP->op_sv); +#endif + + if (identical) { + return do_trans_count_invmap(sv, (AV *) map); + } + else { + return do_trans_invmap(sv, (AV *) map); + } + } + else { + const OPtrans_map * const map = (OPtrans_map*)cPVOP->op_pv; + + if (identical) { + return do_trans_count(sv, map); + } + else if (flags & (OPpTRANS_SQUASH|OPpTRANS_DELETE|OPpTRANS_COMPLEMENT)) { + return do_trans_complex(sv, map); + } + else + return do_trans_simple(sv, map); } } diff --git a/dump.c b/dump.c index 78c151d..f03c3f6 100644 --- a/dump.c +++ b/dump.c @@ -1305,13 +1305,13 @@ S_do_op_dump_bar(pTHX_ I32 level, UV bar, PerlIO *file, const OP *o) case OP_TRANS: case OP_TRANSR: - if (o->op_private & (OPpTRANS_FROM_UTF | OPpTRANS_TO_UTF)) { - /* utf8: table stored as a swash */ + if (o->op_private & OPpTRANS_USE_SVOP) { + /* utf8: table stored as an inversion map */ #ifndef USE_ITHREADS - /* with ITHREADS, swash is stored in the pad, and the right pad + /* with ITHREADS, it is stored in the pad, and the right pad * may not be active here, so skip */ S_opdump_indent(aTHX_ o, level, bar, file, - "SWASH = 0x%" UVxf "\n", + "INVMAP = 0x%" UVxf "\n", PTR2UV(MUTABLE_SV(cSVOPo->op_sv))); #endif } @@ -2986,11 +2986,10 @@ Perl_op_class(pTHX_ const OP *o) * pointer to a table of shorts used to look up translations. * Under utf8, however, a simple table isn't practical; instead, * the OP is an SVOP (or, under threads, a PADOP), - * and the SV is a reference to a swash - * (i.e., an RV pointing to an HV). + * and the SV is an AV. */ return (!custom && - (o->op_private & (OPpTRANS_TO_UTF|OPpTRANS_FROM_UTF)) + (o->op_private & OPpTRANS_USE_SVOP) ) #if defined(USE_ITHREADS) ? OPclass_PADOP : OPclass_PVOP; diff --git a/embed.fnc b/embed.fnc index 87c5159..76ec0c0 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1870,7 +1870,7 @@ Apd |void |sv_vsetpvfn |NN SV *const sv|NN const char *const pat|const STRLEN pa ApR |NV |str_to_version |NN SV *sv EXpR |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none EXp |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8 -#if defined(PERL_IN_REGCOMP_C) +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) EiR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp Ei |void |invlist_extend |NN SV* const invlist|const UV len Ei |void |invlist_set_len|NN SV* const invlist|const UV len|const bool offset @@ -1922,7 +1922,8 @@ EpX |SV* |invlist_clone |NN SV* const invlist|NULLOK SV* newlist #endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) \ || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) \ - || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) + || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) \ + || defined(PERL_IN_DOOP_C) EiRT |UV* |invlist_array |NN SV* const invlist EiRT |bool |is_invlist |NULLOK SV* const invlist EiRT |bool* |get_invlist_offset_addr|NN SV* invlist @@ -2308,9 +2309,8 @@ p |void |init_constants SR |Size_t |do_trans_simple |NN SV * const sv|NN const OPtrans_map * const tbl SR |Size_t |do_trans_count |NN SV * const sv|NN const OPtrans_map * const tbl SR |Size_t |do_trans_complex |NN SV * const sv|NN const OPtrans_map * const tbl -SR |Size_t |do_trans_simple_utf8 |NN SV * const sv -SR |Size_t |do_trans_count_utf8 |NN SV * const sv -SR |Size_t |do_trans_complex_utf8 |NN SV * const sv +SR |Size_t |do_trans_invmap |NN SV * const sv|NN AV * const map +SR |Size_t |do_trans_count_invmap |NN SV * const sv|NN AV * const map #endif #if defined(PERL_IN_GV_C) diff --git a/embed.h b/embed.h index 425ba30..5a1c6fe 100644 --- a/embed.h +++ b/embed.h @@ -1014,7 +1014,6 @@ # endif # if defined(PERL_IN_REGCOMP_C) #define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c) -#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) #define add_data S_add_data #define add_multi_match(a,b,c) S_add_multi_match(aTHX_ a,b,c) #define change_engine_size(a,b) S_change_engine_size(aTHX_ a,b) @@ -1024,20 +1023,13 @@ #define edit_distance S_edit_distance #define get_ANYOFM_contents(a) S_get_ANYOFM_contents(aTHX_ a) #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) -#define get_invlist_iter_addr S_get_invlist_iter_addr #define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) #define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d) #define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e) #define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e) #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) Perl_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j) #define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) -#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b) -#define invlist_highest S_invlist_highest #define invlist_is_iterating S_invlist_is_iterating -#define invlist_iterfinish S_invlist_iterfinish -#define invlist_iterinit S_invlist_iterinit -#define invlist_iternext S_invlist_iternext -#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) #define is_ssc_worth_it S_is_ssc_worth_it #define join_exact(a,b,c,d,e,f,g) S_join_exact(aTHX_ a,b,c,d,e,f,g) #define make_exactf_invlist(a,b) S_make_exactf_invlist(aTHX_ a,b) @@ -1083,6 +1075,16 @@ # if defined(PERL_IN_REGCOMP_C) || defined (PERL_IN_DUMP_C) || defined(PERL_IN_OP_C) #define _invlist_dump(a,b,c,d) Perl__invlist_dump(aTHX_ a,b,c,d) # endif +# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#define add_cp_to_invlist(a,b) S_add_cp_to_invlist(aTHX_ a,b) +#define get_invlist_iter_addr S_get_invlist_iter_addr +#define invlist_extend(a,b) S_invlist_extend(aTHX_ a,b) +#define invlist_highest S_invlist_highest +#define invlist_iterfinish S_invlist_iterfinish +#define invlist_iterinit S_invlist_iterinit +#define invlist_iternext S_invlist_iternext +#define invlist_set_len(a,b,c) S_invlist_set_len(aTHX_ a,b,c) +# endif # if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) #define _invlistEQ(a,b,c) Perl__invlistEQ(aTHX_ a,b,c) #define _new_invlist_C_array(a) Perl__new_invlist_C_array(aTHX_ a) @@ -1094,7 +1096,7 @@ #endif #define regprop(a,b,c,d,e) Perl_regprop(aTHX_ a,b,c,d,e) # endif -# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) +# if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) #define _invlist_contains_cp S__invlist_contains_cp #define _invlist_len S__invlist_len #define _invlist_search Perl__invlist_search @@ -1603,11 +1605,10 @@ # endif # if defined(PERL_IN_DOOP_C) #define do_trans_complex(a,b) S_do_trans_complex(aTHX_ a,b) -#define do_trans_complex_utf8(a) S_do_trans_complex_utf8(aTHX_ a) #define do_trans_count(a,b) S_do_trans_count(aTHX_ a,b) -#define do_trans_count_utf8(a) S_do_trans_count_utf8(aTHX_ a) +#define do_trans_count_invmap(a,b) S_do_trans_count_invmap(aTHX_ a,b) +#define do_trans_invmap(a,b) S_do_trans_invmap(aTHX_ a,b) #define do_trans_simple(a,b) S_do_trans_simple(aTHX_ a,b) -#define do_trans_simple_utf8(a) S_do_trans_simple_utf8(aTHX_ a) # endif # if defined(PERL_IN_DUMP_C) #define deb_curcv(a) S_deb_curcv(aTHX_ a) diff --git a/invlist_inline.h b/invlist_inline.h index 76d6dda..33f8aee 100644 --- a/invlist_inline.h +++ b/invlist_inline.h @@ -14,7 +14,8 @@ || defined(PERL_IN_REGEXEC_C) \ || defined(PERL_IN_TOKE_C) \ || defined(PERL_IN_PP_C) \ - || defined(PERL_IN_OP_C) + || defined(PERL_IN_OP_C) \ + || defined(PERL_IN_DOOP_C) /* An element is in an inversion list iff its index is even numbered: 0, 2, 4, * etc */ @@ -92,7 +93,7 @@ S_invlist_array(SV* const invlist) } #endif -#if defined(PERL_IN_REGCOMP_C) +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) diff --git a/lib/B/Deparse.pm b/lib/B/Deparse.pm index 88555ff..1ae4619 100644 --- a/lib/B/Deparse.pm +++ b/lib/B/Deparse.pm @@ -279,6 +279,8 @@ BEGIN { for (qw[ const stringify rv2sv list glob pushmark null aelem # _pessimise_walk(): recursively walk the optree of a sub, # possibly undoing optimisations along the way. +sub DEBUG { 0 } + sub _pessimise_walk { my ($self, $startop) = @_; @@ -5714,100 +5716,81 @@ sub tr_chr { } } -# XXX This doesn't yet handle all cases correctly either +sub tr_invmap { + my ($invlist_ref, $map_ref) = @_; -sub tr_decode_utf8 { - my($swash_hv, $flags) = @_; - my %swash = $swash_hv->ARRAY; - my $final = undef; - $final = $swash{'FINAL'}->IV if exists $swash{'FINAL'}; - my $none = $swash{"NONE"}->IV; - my $extra = $none + 1; - my(@from, @delfrom, @to); - my $line; - foreach $line (split /\n/, $swash{'LIST'}->PV) { - my($min, $max, $result) = split(/\t/, $line); - $min = hex $min; - if (length $max) { - $max = hex $max; - } else { - $max = $min; - } - $result = hex $result; - if ($result == $extra) { - push @delfrom, [$min, $max]; - } else { - push @from, [$min, $max]; - push @to, [$result, $result + $max - $min]; - } - } - for my $i (0 .. $#from) { - if ($from[$i][0] == ord '-') { - unshift @from, splice(@from, $i, 1); - unshift @to, splice(@to, $i, 1); - last; - } elsif ($from[$i][1] == ord '-') { - $from[$i][1]--; - $to[$i][1]--; - unshift @from, ord '-'; - unshift @to, ord '-'; - last; - } - } - for my $i (0 .. $#delfrom) { - if ($delfrom[$i][0] == ord '-') { - push @delfrom, splice(@delfrom, $i, 1); - last; - } elsif ($delfrom[$i][1] == ord '-') { - $delfrom[$i][1]--; - push @delfrom, ord '-'; - last; - } + my $infinity = ~0 >> 1; # IV_MAX + my $from = ""; + my $to = ""; + + for my $i (0.. @$invlist_ref - 1) { + my $this_from = $invlist_ref->[$i]; + my $map = $map_ref->[$i]; + my $upper = ($i < @$invlist_ref - 1) + ? $invlist_ref->[$i+1] + : $infinity; + my $range = $upper - $this_from - 1; + if (DEBUG) { + print STDERR "i=$i, from=$this_from, upper=$upper, range=$range\n"; + } + next if $map == ~0; + next if $map == ~0 - 1; + $from .= tr_chr($this_from); + $to .= tr_chr($map); + next if $range == 0; # Single code point + if ($range == 1) { # Adjacent code points + $from .= tr_chr($this_from + 1); + $to .= tr_chr($map + 1); + } + elsif ($upper != $infinity) { + $from .= "-" . tr_chr($this_from + $range); + $to .= "-" . tr_chr($map + $range); + } + else { + $from .= "-INFTY"; + $to .= "-INFTY"; + } } - if (defined $final and $to[$#to][1] != $final) { - push @to, [$final, $final]; + + return ($from, $to); +} + +sub tr_decode_utf8 { + my($tr_av, $flags) = @_; + printf STDERR "flags=0x%x\n", $flags if DEBUG; + my $invlist = $tr_av->ARRAYelt(0); + my @invlist = unpack("J*", $invlist->PV); + my @map = unpack("J*", $tr_av->ARRAYelt(1)->PV); + + if (DEBUG) { + for my $i (0 .. @invlist - 1) { + printf STDERR "[%d]\t%x\t", $i, $invlist[$i]; + my $map = $map[$i]; + if ($map == ~0) { + print STDERR "TR_UNMAPPED\n"; + } + elsif ($map == ~0 - 1) { + print STDERR "TR_SPECIAL\n"; + } + else { + printf STDERR "%x\n", $map; + } + } } - push @from, @delfrom; + + my ($from, $to) = tr_invmap(\@invlist, \@map); + if ($flags & OPpTRANS_COMPLEMENT) { - my @newfrom; - my $next = 0; - for my $i (0 .. $#from) { - push @newfrom, [$next, $from[$i][0] - 1]; - $next = $from[$i][1] + 1; - } - @from = (); - for my $range (@newfrom) { - if ($range->[0] <= $range->[1]) { - push @from, $range; - } - } - } - my($from, $to, $diff); - for my $chunk (@from) { - $diff = $chunk->[1] - $chunk->[0]; - if ($diff > 1) { - $from .= tr_chr($chunk->[0]) . "-" . tr_chr($chunk->[1]); - } elsif ($diff == 1) { - $from .= tr_chr($chunk->[0]) . tr_chr($chunk->[1]); - } else { - $from .= tr_chr($chunk->[0]); - } + shift @map; + pop @invlist; + my $throw_away; + ($from, $throw_away) = tr_invmap(\@invlist, \@map); } - for my $chunk (@to) { - $diff = $chunk->[1] - $chunk->[0]; - if ($diff > 1) { - $to .= tr_chr($chunk->[0]) . "-" . tr_chr($chunk->[1]); - } elsif ($diff == 1) { - $to .= tr_chr($chunk->[0]) . tr_chr($chunk->[1]); - } else { - $to .= tr_chr($chunk->[0]); - } + + if (DEBUG) { + print STDERR "Returning ", escape_str($from), "/", + escape_str($to), "\n"; } - #$final = sprintf("%04x", $final) if defined $final; - #$none = sprintf("%04x", $none) if defined $none; - #$extra = sprintf("%04x", $extra) if defined $extra; - #print STDERR "final: $final\n none: $none\nextra: $extra\n"; - #print STDERR $swash{'LIST'}->PV; return (escape_str($from), escape_str($to)); } @@ -5821,9 +5804,9 @@ sub pp_trans { ($from, $to) = tr_decode_byte($op->pv, $priv_flags); } elsif ($class eq "PADOP") { ($from, $to) - = tr_decode_utf8($self->padval($op->padix)->RV, $priv_flags); + = tr_decode_utf8($self->padval($op->padix), $priv_flags); } else { # class($op) eq "SVOP" - ($from, $to) = tr_decode_utf8($op->sv->RV, $priv_flags); + ($from, $to) = tr_decode_utf8($op->sv, $priv_flags); } my $flags = ""; $flags .= "c" if $priv_flags & OPpTRANS_COMPLEMENT; diff --git a/op.c b/op.c index 12ee52a..91eb50a 100644 --- a/op.c +++ b/op.c @@ -1059,7 +1059,7 @@ Perl_op_clear(pTHX_ OP *o) case OP_TRANS: case OP_TRANSR: if ( (o->op_type == OP_TRANS || o->op_type == OP_TRANSR) - && (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))) + && (o->op_private & OPpTRANS_USE_SVOP)) { #ifdef USE_ITHREADS if (cPADOPo->op_padix > 0) { @@ -6784,8 +6784,8 @@ static int uvcompare(const void *a, const void *b) * OPpTRANS_SQUASH * OPpTRANS_DELETE * flags as appropriate; this function may add - * OPpTRANS_FROM_UTF - * OPpTRANS_TO_UTF + * OPpTRANS_USE_SVOP + * OPpTRANS_CAN_FORCE_UTF8 * OPpTRANS_IDENTICAL * OPpTRANS_GROWS * flags @@ -6794,416 +6794,1028 @@ static int uvcompare(const void *a, const void *b) static OP * S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl) { + /* This function compiles a tr///, from data gathered from toke.c, into a + * form suitable for use by do_trans() in doop.c at runtime. + * + * It first normalizes the data, while discarding extraneous inputs; then + * writes out the compiled data. The normalization allows for complete + * analysis, and avoids some false negatives and positives earlier versions + * of this code had. + * + * The normalization form is an inversion map (described below in detail). + * This is essentially the compiled form for tr///'s that require UTF-8, + * and its easy to use it to write the 257-byte table for tr///'s that + * don't need UTF-8. That table is identical to what's been in use for + * many perl versions, except that it doesn't handle some edge cases that + * it used to, involving code points above 255. The UTF-8 form now handles + * these. (This could be changed with extra coding should it shown to be + * desirable.) + * + * If the complement (/c) option is specified, the lhs string (tstr) is + * parsed into an inversion list. Complementing these is trivial. Then a + * complemented tstr is built from that, and used thenceforth. This hides + * the fact that it was complemented from almost all successive code. + * + * One of the important characteristics to know about the input is whether + * the transliteration may be done in place, or does a temporary need to be + * allocated, then copied. If the replacement for every character in every + * possible string takes up no more bytes than the the character it + * replaces, then it can be edited in place. Otherwise the replacement + * could "grow", depending on the strings being processed. Some inputs + * won't grow, and might even shrink under /d, but some inputs could grow, + * so we have to assume any given one might grow. On very long inputs, the + * temporary could eat up a lot of memory, so we want to avoid it if + * possible. For non-UTF-8 inputs, everything is single-byte, so can be + * edited in place, unless there is something in the pattern that could + * force it into UTF-8. The inversion map makes it feasible to determine + * this. Previous versions of this code pretty much punted on determining + * if UTF-8 could be edited in place. Now, this code is rigorous in making + * that determination. + * + * Another characteristic we need to know is whether the lhs and rhs are + * identical. If so, and no other flags are present, the only effect of + * the tr/// is to count the characters present in the input that are + * mentioned in the lhs string. The implementation of that is easier and + * runs faster than the more general case. Normalizing here allows for + * accurate determination of this. Previously there were false negatives + * possible. + * + * Instead of 'transliterated', the comments here use 'unmapped' for the + * characters that are left unchanged by the operation; otherwise they are + * 'mapped' + * + * The lhs of the tr/// is here referred to as the t side. + * The rhs of the tr/// is here referred to as the r side. + */ + SV * const tstr = ((SVOP*)expr)->op_sv; SV * const rstr = ((SVOP*)repl)->op_sv; STRLEN tlen; STRLEN rlen; - const U8 *t = (U8*)SvPV_const(tstr, tlen); - const U8 *r = (U8*)SvPV_const(rstr, rlen); - Size_t i, j; - bool grows = FALSE; - OPtrans_map *tbl; - SSize_t struct_size; /* malloced size of table struct */ - + const U8 * t0 = (U8*)SvPV_const(tstr, tlen); + const U8 * r0 = (U8*)SvPV_const(rstr, rlen); + const U8 * t = t0; + const U8 * r = r0; + Size_t t_count = 0, r_count = 0; /* Number of characters in search and + replacement lists */ + + /* khw thinks some of the private flags for this op are quaintly named. + * OPpTRANS_GROWS for example is TRUE if the replacement for some lhs + * character when represented in UTF-8 is longer than the original + * character's UTF-8 representation */ const bool complement = cBOOL(o->op_private & OPpTRANS_COMPLEMENT); const bool squash = cBOOL(o->op_private & OPpTRANS_SQUASH); const bool del = cBOOL(o->op_private & OPpTRANS_DELETE); - SV* swash; + + /* Set to true if there is some character < 256 in the lhs that maps to > + * 255. If so, a non-UTF-8 match string can be forced into requiring to be + * in UTF-8 by a tr/// operation. */ + bool can_force_utf8 = FALSE; + + /* What is the maximum expansion factor in UTF-8 transliterations. If a + * 2-byte UTF-8 encoded character is to be replaced by a 3-byte one, its + * expansion factor is 1.5. This number is used at runtime to calculate + * how much space to allocate for non-inplace transliterations. Without + * this number, the worst case is 14, which is extremely unlikely to happen + * in real life, and would require significant memory overhead. */ + NV max_expansion = 1.; + + SSize_t t_range_count, r_range_count, min_range_count; + UV* t_array; + SV* t_invlist; + UV* r_map; + UV r_cp, t_cp; + IV t_cp_end = -1; + UV r_cp_end; + Size_t len; + AV* invmap; + UV final_map = TR_UNLISTED; /* The final character in the replacement + list, updated as we go along. Initialize + to something illegal */ + + bool rstr_utf8 = cBOOL(SvUTF8(rstr)); + bool tstr_utf8 = cBOOL(SvUTF8(tstr)); + + const U8* tend = t + tlen; + const U8* rend = r + rlen; + + SV * inverted_tstr = NULL; + + Size_t i; + unsigned int pass2; + + /* This routine implements detection of a transliteration having a longer + * UTF-8 representation than its source, by partitioning all the possible + * code points of the platform into equivalence classes of the same UTF-8 + * byte length in the first pass. As it constructs the mappings, it carves + * these up into smaller chunks, but doesn't merge any together. This + * makes it easy to find the instances it's looking for. A second pass is + * done after this has been determined which merges things together to + * shrink the table for runtime. For ASCII platforms, the table is + * trivial, given below, and uses the fundamental characteristics of UTF-8 + * to construct the values. For EBCDIC, it isn't so, and we rely on a + * table constructed by the perl script that generates these kinds of + * things */ +#ifndef EBCDIC + UV PL_partition_by_byte_length[] = { + 0, + 0x80, + (32 * (1UL << ( UTF_ACCUMULATION_SHIFT))), + (32 * (1UL << ( UTF_ACCUMULATION_SHIFT))), + (16 * (1UL << (2 * UTF_ACCUMULATION_SHIFT))), + ( 8 * (1UL << (3 * UTF_ACCUMULATION_SHIFT))), + ( 4 * (1UL << (4 * UTF_ACCUMULATION_SHIFT))), + ( 2 * (1UL << (5 * UTF_ACCUMULATION_SHIFT))) + +# ifdef UV_IS_QUAD + , + ( (1UL << (6 * UTF_ACCUMULATION_SHIFT))) + +# endif +#endif + }; PERL_ARGS_ASSERT_PMTRANS; PL_hints |= HINT_BLOCK_SCOPE; - if (SvUTF8(tstr)) - o->op_private |= OPpTRANS_FROM_UTF; + /* If /c, the search list is sorted and complemented. This is now done by + * creating an inversion list from it, and then trivially inverting that. + * The previous implementation used qsort, but creating the list + * automatically keeps it sorted as we go along */ + if (complement) { + UV start, end; + SV * inverted_tlist = _new_invlist(tlen); + Size_t temp_len; + + while (t < tend) { + + /* Non-utf8 strings don't have ranges, so each character is listed + * out */ + if (! tstr_utf8) { + inverted_tlist = add_cp_to_invlist(inverted_tlist, *t); + t++; + } + else { /* But UTF-8 strings have been parsed in toke.c to have + * ranges if appropriate. */ + UV t_cp; + Size_t t_char_len; + + /* Get the first character */ + t_cp = valid_utf8_to_uvchr(t, &t_char_len); + t += t_char_len; + + /* If the next byte indicates that this wasn't the first + * element of a range, the range is just this one */ + if (t >= tend || *t != RANGE_INDICATOR) { + inverted_tlist = add_cp_to_invlist(inverted_tlist, t_cp); + } + else { /* Otherwise, ignore the indicator byte, and get the + final element, and add the whole range */ + t++; + t_cp_end = valid_utf8_to_uvchr(t, &t_char_len); + t += t_char_len; + + inverted_tlist = _add_range_to_invlist(inverted_tlist, + t_cp, t_cp_end); + } + } + } /* End of parse through tstr */ + + /* The inversion list is done; now invert it */ + _invlist_invert(inverted_tlist); + + /* Now go through the inverted list and create a new tstr for the rest + * of the routine to use. Since the UTF-8 version can have ranges, and + * can be much more compact than the non-UTF-8 version, we create the + * string in UTF-8 even if not necessary. (This is just an intermediate + * value that gets thrown away anyway.) */ + invlist_iterinit(inverted_tlist); + inverted_tstr = newSVpvs(""); + while (invlist_iternext(inverted_tlist, &start, &end)) { + U8 temp[UTF8_MAXBYTES]; + U8 * temp_end_pos; + + /* IV_MAX keeps things from going out of bounds */ + start = MIN(IV_MAX, start); + end = MIN(IV_MAX, end); + + temp_end_pos = uvchr_to_utf8(temp, start); + sv_catpvn(inverted_tstr, (char *) temp, temp_end_pos - temp); + + if (start != end) { + Perl_sv_catpvf(aTHX_ inverted_tstr, "%c", RANGE_INDICATOR); + temp_end_pos = uvchr_to_utf8(temp, end); + sv_catpvn(inverted_tstr, (char *) temp, temp_end_pos - temp); + } + } + + /* Set up so the remainder of the routine uses this complement, instead + * of the actual input */ + t0 = t = (U8*)SvPV_const(inverted_tstr, temp_len); + tend = t0 + temp_len; + tstr_utf8 = TRUE; + + SvREFCNT_dec_NN(inverted_tlist); + } + + /* For non-/d, an empty rhs means to use the lhs */ + if (rlen == 0 && ! del) { + r0 = t0; + rend = tend; + rstr_utf8 = tstr_utf8; + } + + t_invlist = _new_invlist(1); + + /* Parse the (potentially adjusted) input, creating the inversion map. + * This is done in two passes. The first pass is to determine if the + * transliteration can be done in place. The inversion map it creates + * could be used, but generally would be larger and slower to run than the + * output of the second pass, which starts with a more compact table and + * allows more ranges to be merged */ + for (pass2 = 0; pass2 < 2; pass2++) { - if (SvUTF8(rstr)) - o->op_private |= OPpTRANS_TO_UTF; + /* Initialize to a single range */ + t_invlist = _add_range_to_invlist(t_invlist, 0, UV_MAX); - if (o->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF)) { + /* In the second pass, we just have the single range */ - /* for utf8 translations, op_sv will be set to point to a swash - * containing codepoint ranges. This is done by first assembling - * a textual representation of the ranges in listsv then compiling - * it using swash_init(). For more details of the textual format, - * see L . + if (pass2) { + len = 1; + t_array = invlist_array(t_invlist); + } + else { + + /* But in the first pass, the lhs is partitioned such that the + * number of UTF-8 bytes required to represent a code point in each + * partition is the same as the number for any other code point in + * that partion. We copy the pre-compiled partion. */ + len = C_ARRAY_LENGTH(PL_partition_by_byte_length); + invlist_extend(t_invlist, len); + t_array = invlist_array(t_invlist); + Copy(PL_partition_by_byte_length, t_array, len, UV); + invlist_set_len(t_invlist, + len, + *(get_invlist_offset_addr(t_invlist))); + Newx(r_map, len + 1, UV); + } + + /* And the mapping of each of the ranges is initialized. Initially, + * everything is TR_UNLISTED. */ + for (i = 0; i < len; i++) { + r_map[i] = TR_UNLISTED; + } + + t = t0; + t_count = 0; + r = r0; + r_count = 0; + t_range_count = r_range_count = 0; + + /* Now go through the search list constructing an inversion map. The + * input is not necessarily in any particular order. Making it an + * inversion map orders it, potentially simplifying, and makes it easy + * to deal with at run time. This is the only place in core that + * generates an inversion map; if others were introduced, it might be + * better to create general purpose routines to handle them. + * (Inversion maps are created in perl in other places.) + * + * An inversion map consists of two parallel arrays. One is + * essentially an inversion list: an ordered list of code points such + * that each element gives the first code point of a range of + * consecutive code points that map to the element in the other array + * that has the same index as this one (in other words, the + * corresponding element). Thus the range extends up to (but not + * including) the code point given by the next higher element. In a + * true inversion map, the corresponding element in the other array + * gives the mapping of the first code point in the range, with the + * understanding that the next higher code point in the inversion + * list's range will map to the next higher code point in the map. + * + * So if at element [i], let's say we have: + * + * t_invlist r_map + * [i] A a + * + * This means that A => a, B => b, C => c.... Let's say that the + * situation is such that: + * + * [i+1] L -1 + * + * This means the sequence that started at [i] stops at K => k. This + * illustrates that you need to look at the next element to find where + * a sequence stops. Except, the highest element in the inversion list + * begins a range that is understood to extend to the platform's + * infinity. + * + * This routine modifies traditional inversion maps to reserve two + * mappings: + * + * TR_UNLISTED (or -1) indicates that the no code point in the range + * is listed in the tr/// searchlist. At runtime, these are + * always passed through unchanged. In the inversion map, all + * points in the range are mapped to -1, instead of increasing, + * like the 'L' in the example above. + * + * We start the parse with every code point mapped to this, and as + * we parse and find ones that are listed in the search list, we + * carve out ranges as we go along that override that. + * + * TR_SPECIAL_HANDLING (or -2) indicates that every code point in the + * range needs special handling. Again, all code points in the + * range are mapped to -2, instead of increasing. + * + * Under /d this value means the code point should be deleted from + * the transliteration when encountered. + * + * Otherwise, it marks that every code point in the range is to + * map to the final character in the replacement list. This + * happens only when the replacement list is shorter than the + * search one, so there are things in the search list that have no + * correspondence in the replacement list. For example, in + * tr/a-z/A/, 'A' is the final value, and the inversion map + * generated for this would be like this: + * \0 => -1 + * a => A + * b-z => -2 + * z+1 => -1 + * 'A' appears once, then the remainder of the range maps to -2. + * The use of -2 isn't strictly necessary, as an inversion map is + * capable of representing this situation, but not nearly so + * compactly, and this is actually quite commonly encountered. + * Indeed, the original design of this code used a full inversion + * map for this. But things like + * tr/\0-\x{FFFF}/A/ + * generated huge data structures, slowly, and the execution was + * also slow. So the current scheme was implemented. + * + * So, if the next element in our example is: + * + * [i+2] Q q + * + * Then all of L, M, N, O, and P map to TR_UNLISTED. If the next + * elements are + * + * [i+3] R z + * [i+4] S TR_UNLISTED + * + * Then Q => q; R => z; and S => TR_UNLISTED. If [i+4] (the 'S') is + * the final element in the arrays, every code point from S to infinity + * maps to TR_UNLISTED. + * */ + /* Finish up range started in what otherwise would + * have been the final iteration */ + while (t < tend || t_range_count > 0) { + bool adjacent_to_range_above = FALSE; + bool adjacent_to_range_below = FALSE; + + bool merge_with_range_above = FALSE; + bool merge_with_range_below = FALSE; + + SSize_t i, span, invmap_range_length_remaining; + + /* If we are in the middle of processing a range in the 'target' + * side, the previous iteration has set us up. Otherwise, look at + * the next character in the search list */ + if (t_range_count <= 0) { + if (! tstr_utf8) { + + /* Here, not in the middle of a range, and not UTF-8. The + * next code point is the single byte where we're at */ + t_cp = *t; + t_range_count = 1; + t++; + } + else { + Size_t t_char_len; + + /* Here, not in the middle of a range, and is UTF-8. The + * next code point is the next UTF-8 char in the input. We + * know the input is valid, because the toker constructed + * it */ + t_cp = valid_utf8_to_uvchr(t, &t_char_len); + t += t_char_len; + + /* UTF-8 strings (only) have been parsed in toke.c to have + * ranges. See if the next byte indicates that this was + * the first element of a range. If so, get the final + * element and calculate the range size. If not, the range + * size is 1 */ + if (t < tend && *t == RANGE_INDICATOR) { + t++; + t_range_count = valid_utf8_to_uvchr(t, &t_char_len) + - t_cp + 1; + t += t_char_len; + } + else { + t_range_count = 1; + } + } - SV* const listsv = newSVpvs("# comment\n"); - SV* transv = NULL; - const U8* tend = t + tlen; - const U8* rend = r + rlen; - STRLEN ulen; - UV tfirst = 1; - UV tlast = 0; - IV tdiff; - STRLEN tcount = 0; - UV rfirst = 1; - UV rlast = 0; - IV rdiff; - STRLEN rcount = 0; - IV diff; - I32 none = 0; - U32 max = 0; - I32 bits; - I32 havefinal = 0; - U32 final = 0; - const I32 from_utf = o->op_private & OPpTRANS_FROM_UTF; - const I32 to_utf = o->op_private & OPpTRANS_TO_UTF; - U8* tsave = NULL; - U8* rsave = NULL; - const U32 flags = UTF8_ALLOW_DEFAULT; - - if (!from_utf) { - STRLEN len = tlen; - t = tsave = bytes_to_utf8(t, &len); - tend = t + len; - } - if (!to_utf && rlen) { - STRLEN len = rlen; - r = rsave = bytes_to_utf8(r, &len); - rend = r + len; - } + /* Count the total number of listed code points * */ + t_count += t_range_count; + } -/* There is a snag with this code on EBCDIC: scan_const() in toke.c has - * encoded chars in native encoding which makes ranges in the EBCDIC 0..255 - * odd. */ + /* Similarly, get the next character in the replacement list */ + if (r_range_count <= 0) { + if (r >= rend) { - if (complement) { - /* utf8 and /c: - * replace t/tlen/tend with a version that has the ranges - * complemented - */ - U8 tmpbuf[UTF8_MAXBYTES+1]; - UV *cp; - UV nextmin = 0; - Newx(cp, 2*tlen, UV); - i = 0; - transv = newSVpvs(""); - - /* convert search string into array of (start,end) range - * codepoint pairs stored in cp[]. Most "ranges" will start - * and end at the same char */ - while (t < tend) { - cp[2*i] = utf8n_to_uvchr(t, tend-t, &ulen, flags); - t += ulen; - /* the toker converts X-Y into (X, RANGE_INDICATOR, Y) */ - if (t < tend && *t == RANGE_INDICATOR) { - t++; - cp[2*i+1] = utf8n_to_uvchr(t, tend-t, &ulen, flags); - t += ulen; - } - else { - cp[2*i+1] = cp[2*i]; - } - i++; - } + /* But if we've exhausted the rhs, there is nothing to map + * to, except the special handling one, and we make the + * range the same size as the lhs one. */ + r_cp = TR_SPECIAL_HANDLING; + r_range_count = t_range_count; + } + else { + if (! rstr_utf8) { + r_cp = *r; + r_range_count = 1; + r++; + } + else { + Size_t r_char_len; + + r_cp = valid_utf8_to_uvchr(r, &r_char_len); + r += r_char_len; + if (r < rend && *r == RANGE_INDICATOR) { + r++; + r_range_count = valid_utf8_to_uvchr(r, + &r_char_len) - r_cp + 1; + r += r_char_len; + } + else { + r_range_count = 1; + } + } - /* sort the ranges */ - qsort(cp, i, 2*sizeof(UV), uvcompare); - - /* Create a utf8 string containing the complement of the - * codepoint ranges. For example if cp[] contains [A,B], [C,D], - * then transv will contain the equivalent of: - * join '', map chr, 0, RANGE_INDICATOR, A - 1, - * B + 1, RANGE_INDICATOR, C - 1, - * D + 1, RANGE_INDICATOR, 0x7fffffff; - * A range of a single char skips the RANGE_INDICATOR and - * end cp. - */ - for (j = 0; j < i; j++) { - UV val = cp[2*j]; - diff = val - nextmin; - if (diff > 0) { - t = uvchr_to_utf8(tmpbuf,nextmin); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - if (diff > 1) { - U8 range_mark = RANGE_INDICATOR; - t = uvchr_to_utf8(tmpbuf, val - 1); - sv_catpvn(transv, (char *)&range_mark, 1); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - } - } - val = cp[2*j+1]; - if (val >= nextmin) - nextmin = val + 1; - } + if (r_cp == TR_SPECIAL_HANDLING) { + r_range_count = t_range_count; + } - t = uvchr_to_utf8(tmpbuf,nextmin); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - { - U8 range_mark = RANGE_INDICATOR; - sv_catpvn(transv, (char *)&range_mark, 1); - } - t = uvchr_to_utf8(tmpbuf, 0x7fffffff); - sv_catpvn(transv, (char*)tmpbuf, t - tmpbuf); - t = (const U8*)SvPVX_const(transv); - tlen = SvCUR(transv); - tend = t + tlen; - Safefree(cp); - } - else if (!rlen && !del) { - r = t; rlen = tlen; rend = tend; - } + /* This is the final character so far */ + final_map = r_cp + r_range_count - 1; - if (!squash) { - if ((!rlen && !del) || t == r || - (tlen == rlen && memEQ((char *)t, (char *)r, tlen))) - { - o->op_private |= OPpTRANS_IDENTICAL; - } - } + r_count += r_range_count; + } + } - /* extract char ranges from t and r and append them to listsv */ - - while (t < tend || tfirst <= tlast) { - /* see if we need more "t" chars */ - if (tfirst > tlast) { - tfirst = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags); - t += ulen; - if (t < tend && *t == RANGE_INDICATOR) { /* illegal utf8 val indicates range */ - t++; - tlast = (I32)utf8n_to_uvchr(t, tend - t, &ulen, flags); - t += ulen; - } - else - tlast = tfirst; - } + /* Here, we have the next things ready in both sides. They are + * potentially ranges. We try to process as big a chunk as + * possible at once, but the lhs and rhs must be synchronized, so + * things like tr/A-Z/a-ij-z/ will need to be processed in 2 chunks + * */ + min_range_count = MIN(t_range_count, r_range_count); + + /* Search the inversion list for the entry that contains the input + * code point . The inversion map was initialized to cover the + * entire range of possible inputs, so this should not fail. So + * the return value is the index into the list's array of the range + * that contains , that is, 'i' such that array[i] <= cp < + * array[i+1] */ + i = _invlist_search(t_invlist, t_cp); + assert(i >= 0); + + /* Here, the data structure might look like: + * + * index t r Meaning + * [i-1] J j # J-L => j-l + * [i] M -1 # M => default; as do N, O, P, Q + * [i+1] R x # R => x, S => x+1, T => x+2 + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + * + * where 'x' and 'y' above are not to be taken literally. + * + * The maximum chunk we can handle in this loop iteration, is the + * smallest of the three components: the lhs 't_', the rhs 'r_', + * and the remainder of the range in element [i]. (In pass 1, that + * range will have everything in it be of the same class; we can't + * cross into another class.) 'min_range_count' already contains + * the smallest of the first two values. The final one is + * irrelevant if the map is to the special indicator */ + + invmap_range_length_remaining = ((Size_t) i + 1 < len) + ? t_array[i+1] - t_cp + : IV_MAX - t_cp; + span = MAX(1, MIN(min_range_count, invmap_range_length_remaining)); + + /* The end point of this chunk is where we are, plus the span, but + * never larger than the platform's infinity */ + t_cp_end = MIN(IV_MAX, t_cp + span - 1); + + if (r_cp == TR_SPECIAL_HANDLING) { + r_cp_end = TR_SPECIAL_HANDLING; + } + else { + r_cp_end = MIN(IV_MAX, r_cp + span - 1); + + /* If something on the lhs is below 256, and something on the + * rhs is above, there is a potential mapping here across that + * boundary. Indeed the only way there isn't is if both sides + * start at the same point. That means they both cross at the + * same time. But otherwise one crosses before the other */ + if (t_cp < 256 && r_cp_end > 255 && r_cp != t_cp) { + can_force_utf8 = TRUE; + } + } - /* now see if we need more "r" chars */ - if (rfirst > rlast) { - if (r < rend) { - rfirst = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags); - r += ulen; - if (r < rend && *r == RANGE_INDICATOR) { /* illegal utf8 val indicates range */ - r++; - rlast = (I32)utf8n_to_uvchr(r, rend - r, &ulen, flags); - r += ulen; - } - else - rlast = rfirst; - } - else { - if (!havefinal++) - final = rlast; - rfirst = rlast = 0xffffffff; - } - } + /* If a character appears in the search list more than once, the + * 2nd and succeeding occurrences are ignored, so only do this + * range if haven't already processed this character. (The range + * has been set up so that all members in it will be of the same + * ilk) */ + if (r_map[i] == TR_UNLISTED) { + + /* This is the first definition for this chunk, hence is valid + * and needs to be processed. Here and in the comments below, + * we use the above sample data. The t_cp chunk must be any + * contiguous subset of M, N, O, P, and/or Q. + * + * In the first pass, the t_invlist has been partitioned so + * that all elements in any single range have the same number + * of bytes in their UTF-8 representations. And the r space is + * either a single byte, or a range of strictly monotonically + * increasing code points. So the final element in the range + * will be represented by no fewer bytes than the initial one. + * That means that if the final code point in the t range has + * at least as many bytes as the final code point in the r, + * then all code points in the t range have at least as many + * bytes as their corresponding r range element. But if that's + * not true, the transliteration of at least the final code + * point grows in length. As an example, suppose we had + * tr/\x{fff0}-\x{fff1}/\x{ffff}-\x{10000}/ + * The UTF-8 for all but 10000 occupies 3 bytes on ASCII + * platforms. We have deliberately set up the data structure + * so that any range in the lhs gets split into chunks for + * processing, such that every code point in a chunk has the + * same number of UTF-8 bytes. We only have to check the final + * code point in the rhs against any code point in the lhs. */ + if ( ! pass2 + && r_cp_end != TR_SPECIAL_HANDLING + && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end)) + { + NV ratio = UVCHR_SKIP(r_cp_end) / UVCHR_SKIP(t_cp); - /* now see which range will peter out first, if either. */ - tdiff = tlast - tfirst; - rdiff = rlast - rfirst; - tcount += tdiff + 1; - rcount += rdiff + 1; + o->op_private |= OPpTRANS_GROWS; - if (tdiff <= rdiff) - diff = tdiff; - else - diff = rdiff; + /* Now that we know it grows, we can keep track of the + * largest ratio */ + if (ratio > max_expansion) { + max_expansion = ratio; + } + } - if (rfirst == 0xffffffff) { - diff = tdiff; /* oops, pretend rdiff is infinite */ - if (diff > 0) - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t%04lx\tXXXX\n", - (long)tfirst, (long)tlast); - else - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t\tXXXX\n", (long)tfirst); - } - else { - if (diff > 0) - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t%04lx\t%04lx\n", - (long)tfirst, (long)(tfirst + diff), - (long)rfirst); - else - Perl_sv_catpvf(aTHX_ listsv, "%04lx\t\t%04lx\n", - (long)tfirst, (long)rfirst); - - if (rfirst + diff > max) - max = rfirst + diff; - if (!grows) - grows = (tfirst < rfirst && - UVCHR_SKIP(tfirst) < UVCHR_SKIP(rfirst + diff)); - rfirst += diff + 1; - } - tfirst += diff + 1; - } + /* The very first range is marked as adjacent to the + * non-existent range below it, as it causes things to "just + * work" (TradeMark) + * + * If the lowest code point in this chunk is M, it adjoins the + * J-L range */ + if (t_cp == t_array[i]) { + adjacent_to_range_below = TRUE; + + /* And if the map has the same offset from the beginning of + * the range as does this new code point (or both are for + * TR_SPECIAL_HANDLING), this chunk can be completely + * merged with the range below. EXCEPT, in the first pass, + * we don't merge ranges whose UTF-8 byte representations + * have different lengths, so that we can more easily + * detect if a replacement is longer than the source, that + * is if it 'grows'. But in the 2nd pass, there's no + * reason to not merge */ + if ( (i > 0 && ( pass2 + || UVCHR_SKIP(t_array[i-1]) + == UVCHR_SKIP(t_cp))) + && ( ( r_cp == TR_SPECIAL_HANDLING + && r_map[i-1] == TR_SPECIAL_HANDLING) + || ( r_cp != TR_SPECIAL_HANDLING + && r_cp - r_map[i-1] == t_cp - t_array[i-1]))) + { + merge_with_range_below = TRUE; + } + } - /* compile listsv into a swash and attach to o */ + /* Similarly, if the highest code point in this chunk is 'Q', + * it adjoins the range above, and if the map is suitable, can + * be merged with it */ + if ( t_cp_end >= IV_MAX - 1 + || ( (Size_t) i + 1 < len + && (Size_t) t_cp_end + 1 == t_array[i+1])) + { + adjacent_to_range_above = TRUE; + if ((Size_t) i + 1 < len) + if ( ( pass2 + || UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1])) + && ( ( r_cp == TR_SPECIAL_HANDLING + && r_map[i+1] == (UV) TR_SPECIAL_HANDLING) + || ( r_cp != TR_SPECIAL_HANDLING + && r_cp_end == r_map[i+1] - 1))) + { + merge_with_range_above = TRUE; + } + } - none = ++max; - if (del) - ++max; + if (merge_with_range_below && merge_with_range_above) { + + /* Here the new chunk looks like M => m, ... Q => q; and + * the range above is like R => r, .... Thus, the [i-1] + * and [i+1] ranges should be seamlessly melded so the + * result looks like + * + * [i-1] J j # J-T => j-t + * [i] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ + Move(t_array + i + 2, t_array + i, len - i - 2, UV); + Move(r_map + i + 2, r_map + i, len - i - 2, UV); + len -= 2; + invlist_set_len(t_invlist, + len, + *(get_invlist_offset_addr(t_invlist))); + } + else if (merge_with_range_below) { + + /* Here the new chunk looks like M => m, .... But either + * (or both) it doesn't extend all the way up through Q; or + * the range above doesn't start with R => r. */ + if (! adjacent_to_range_above) { + + /* In the first case, let's say the new chunk extends + * through O. We then want: + * + * [i-1] J j # J-O => j-o + * [i] P -1 # P => -1, Q => -1 + * [i+1] R x # R => x, S => x+1, T => x+2 + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ + t_array[i] = t_cp_end + 1; + r_map[i] = TR_UNLISTED; + } + else { /* Adjoins the range above, but can't merge with it + (because 'x' is not the next map after q) */ + /* + * [i-1] J j # J-Q => j-q + * [i] R x # R => x, S => x+1, T => x+2 + * [i+1] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ - if (max > 0xffff) - bits = 32; - else if (max > 0xff) - bits = 16; - else - bits = 8; + Move(t_array + i + 1, t_array + i, len - i - 1, UV); + Move(r_map + i + 1, r_map + i, len - i - 1, UV); + len--; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + } + else if (merge_with_range_above) { + + /* Here the new chunk ends with Q => q, and the range above + * must start with R => r, so the two can be merged. But + * either (or both) the new chunk doesn't extend all the + * way down to M; or the mapping of the final code point + * range below isn't m */ + if (! adjacent_to_range_below) { + + /* In the first case, let's assume the new chunk starts + * with P => p. Then, because it's merge-able with the + * range above, that range must be R => r. We want: + * + * [i-1] J j # J-L => j-l + * [i] M -1 # M => -1, N => -1 + * [i+1] P p # P-T => p-t + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ + t_array[i+1] = t_cp; + r_map[i+1] = r_cp; + } + else { /* Adjoins the range below, but can't merge with it + */ + /* + * [i-1] J j # J-L => j-l + * [i] M x # M-T => x-5 .. x+2 + * [i+1] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... + * infinity + */ + Move(t_array + i + 1, t_array + i, len - i - 1, UV); + Move(r_map + i + 1, r_map + i, len - i - 1, UV); + len--; + t_array[i] = t_cp; + r_map[i] = r_cp; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + } + else if (adjacent_to_range_below && adjacent_to_range_above) { + /* The new chunk completely fills the gap between the + * ranges on either side, but can't merge with either of + * them. + * + * [i-1] J j # J-L => j-l + * [i] M z # M => z, N => z+1 ... Q => z+4 + * [i+1] R x # R => x, S => x+1, T => x+2 + * [i+2] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ + r_map[i] = r_cp; + } + else if (adjacent_to_range_below) { + /* The new chunk adjoins the range below, but not the range + * above, and can't merge. Let's assume the chunk ends at + * O. + * + * [i-1] J j # J-L => j-l + * [i] M z # M => z, N => z+1, O => z+2 + * [i+1] P -1 # P => -1, Q => -1 + * [i+2] R x # R => x, S => x+1, T => x+2 + * [i+3] U y # U => y, V => y+1, ... + * ... + * [-w] Z -1 # Z => default; as do Z+1, ... infinity + */ + invlist_extend(t_invlist, len + 1); + t_array = invlist_array(t_invlist); + Renew(r_map, len + 1, UV); + + Move(t_array + i + 1, t_array + i + 2, len - i - 1, UV); + Move(r_map + i + 1, r_map + i + 2, len - i - 1, UV); + r_map[i] = r_cp; + t_array[i+1] = t_cp_end + 1; + r_map[i+1] = TR_UNLISTED; + len++; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + else if (adjacent_to_range_above) { + /* The new chunk adjoins the range above, but not the range + * below, and can't merge. Let's assume the new chunk + * starts at O + * + * [i-1] J j # J-L => j-l + * [i] M -1 # M => default, N => default + * [i+1] O z # O => z, P => z+1, Q => z+2 + * [i+2] R x # R => x, S => x+1, T => x+2 + * [i+3] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ + invlist_extend(t_invlist, len + 1); + t_array = invlist_array(t_invlist); + Renew(r_map, len + 1, UV); + + Move(t_array + i + 1, t_array + i + 2, len - i - 1, UV); + Move(r_map + i + 1, r_map + i + 2, len - i - 1, UV); + t_array[i+1] = t_cp; + r_map[i+1] = r_cp; + len++; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + } + else { + /* The new chunk adjoins neither the range above, nor the + * range below. Lets assume it is N..P => n..p + * + * [i-1] J j # J-L => j-l + * [i] M -1 # M => default + * [i+1] N n # N..P => n..p + * [i+2] Q -1 # Q => default + * [i+3] R x # R => x, S => x+1, T => x+2 + * [i+4] U y # U => y, V => y+1, ... + * ... + * [-1] Z -1 # Z => default; as do Z+1, ... infinity + */ - swash = MUTABLE_SV(swash_init("utf8", "", listsv, bits, none)); -#ifdef USE_ITHREADS - cPADOPo->op_padix = pad_alloc(OP_TRANS, SVf_READONLY); - SvREFCNT_dec(PAD_SVl(cPADOPo->op_padix)); - PAD_SETSV(cPADOPo->op_padix, swash); - SvPADTMP_on(swash); - SvREADONLY_on(swash); -#else - cSVOPo->op_sv = swash; -#endif - SvREFCNT_dec(listsv); - SvREFCNT_dec(transv); + invlist_extend(t_invlist, len + 2); + t_array = invlist_array(t_invlist); + Renew(r_map, len + 2, UV); + + Move(t_array + i + 1, + t_array + i + 2 + 1, len - i - (2 - 1), UV); + Move(r_map + i + 1, + r_map + i + 2 + 1, len - i - (2 - 1), UV); + + len += 2; + invlist_set_len(t_invlist, len, + *(get_invlist_offset_addr(t_invlist))); + + t_array[i+1] = t_cp; + r_map[i+1] = r_cp; + + t_array[i+2] = t_cp_end + 1; + r_map[i+2] = TR_UNLISTED; + } + } /* End of this chunk needs to be processed */ + + /* Done with this chunk. */ + t_cp += span; + if (t_cp >= IV_MAX) { + break; + } + t_range_count -= span; + if (r_cp != TR_SPECIAL_HANDLING) { + r_cp += span; + r_range_count -= span; + } + else { + r_range_count = 0; + } + + } /* End of loop through the search list */ + + /* We don't need an exact count, but we do need to know if there is + * anything left over in the replacement list. So, just assume it's + * one byte per character */ + if (rend > r) { + r_count++; + } + } /* End of passes */ + + SvREFCNT_dec(inverted_tstr); - if (!del && havefinal && rlen) - (void)hv_store(MUTABLE_HV(SvRV(swash)), "FINAL", 5, - newSVuv((UV)final), 0); + /* We now have normalized the input into an inversion map. + * + * See if the lhs and rhs are equivalent. If so, this tr/// is a no-op + * except for the count, and streamlined runtime code can be used */ + if (!del && !squash) { + + /* They are identical if they point to same address, or if everything + * maps to UNLISTED or to itself. This catches things that not looking + * at the normalized inversion map doesn't catch, like tr/aa/ab/ or + * tr/\x{100}-\x{104}/\x{100}-\x{102}\x{103}-\x{104} */ + if (r0 != t0) { + for (i = 0; i < len; i++) { + if (r_map[i] != TR_UNLISTED && r_map[i] != t_array[i]) { + goto done_identical_check; + } + } + } + + /* Here have gone through entire list, and didn't find any + * non-identical mappings */ + o->op_private |= OPpTRANS_IDENTICAL; + + done_identical_check: ; + } + + t_array = invlist_array(t_invlist); + + /* If has components above 255, we generally need to use the inversion map + * implementation */ + if ( can_force_utf8 + || ( len > 0 + && t_array[len-1] > 255 + /* If the final range is 0x100-INFINITY and is a special + * mapping, the table implementation can handle it */ + && ! ( t_array[len-1] == 256 + && ( r_map[len-1] == TR_UNLISTED + || r_map[len-1] == TR_SPECIAL_HANDLING)))) + { + SV* r_map_sv; + + /* A UTF-8 op is generated, indicated by this flag. This op is an + * sv_op */ + o->op_private |= OPpTRANS_USE_SVOP; + + if (can_force_utf8) { + o->op_private |= OPpTRANS_CAN_FORCE_UTF8; + } + + /* The inversion map is pushed; first the list. */ + invmap = MUTABLE_AV(newAV()); + av_push(invmap, t_invlist); - Safefree(tsave); - Safefree(rsave); + /* 2nd is the mapping */ + r_map_sv = newSVpvn((char *) r_map, len * sizeof(UV)); + av_push(invmap, r_map_sv); - tlen = tcount; - rlen = rcount; - if (r < rend) - rlen++; - else if (rlast == 0xffffffff) - rlen = 0; + /* 3rd is the max possible expansion factor */ + av_push(invmap, newSVnv(max_expansion)); + + /* Characters that are in the search list, but not in the replacement + * list are mapped to the final character in the replacement list */ + if (! del && r_count < t_count) { + av_push(invmap, newSVuv(final_map)); + } + +#ifdef USE_ITHREADS + cPADOPo->op_padix = pad_alloc(OP_TRANS, SVf_READONLY); + SvREFCNT_dec(PAD_SVl(cPADOPo->op_padix)); + PAD_SETSV(cPADOPo->op_padix, (SV *) invmap); + SvPADTMP_on(invmap); + SvREADONLY_on(invmap); +#else + cSVOPo->op_sv = (SV *) invmap; +#endif - goto warnins; } + else { + OPtrans_map *tbl; + Size_t i; + + /* The OPtrans_map struct already contains one slot; hence the -1. */ + SSize_t struct_size = sizeof(OPtrans_map) + + (256 - 1 + 1)*sizeof(short); /* Non-utf8 case: set o->op_pv to point to a simple 256+ entry lookup * table. Entries with the value TR_UNMAPPED indicate chars not to be * translated, while TR_DELETE indicates a search char without a * corresponding replacement char under /d. * - * Normally, the table has 256 slots. However, in the presence of - * /c, the search charlist has an implicit \x{100}-\x{7fffffff} - * added, and if there are enough replacement chars to start pairing - * with the \x{100},... search chars, then a larger (> 256) table - * is allocated. - * - * In addition, regardless of whether under /c, an extra slot at the - * end is used to store the final repeating char, or TR_R_EMPTY under an - * empty replacement list, or TR_DELETE under /d; which makes the - * runtime code easier. - * - * The toker will have already expanded char ranges in t and r. + * In addition, an extra slot at the end is used to store the final + * repeating char, or TR_R_EMPTY under an empty replacement list, or + * TR_DELETE under /d; which makes the runtime code easier. */ - /* Initially allocate 257-slot table: 256 for basic (non /c) usage, - * plus final slot for repeat/TR_DELETE/TR_R_EMPTY. Later we realloc if - * excess > * 0. The OPtrans_map struct already contains one slot; - * hence the -1. - */ - struct_size = sizeof(OPtrans_map) + (256 - 1 + 1)*sizeof(short); + /* Indicate this is an op_pv */ + o->op_private &= ~OPpTRANS_USE_SVOP; + tbl = (OPtrans_map*)PerlMemShared_calloc(struct_size, 1); tbl->size = 256; cPVOPo->op_pv = (char*)tbl; - if (complement) { - Size_t excess; + for (i = 0; i < len; i++) { + STATIC_ASSERT_DECL(TR_SPECIAL_HANDLING == TR_DELETE); + short upper = i >= len - 1 ? 256 : t_array[i+1]; + short to = r_map[i]; + short j; + bool do_increment = TRUE; - /* in this branch, j is a count of 'consumed' (i.e. paired off - * with a search char) replacement chars (so j <= rlen always) - */ - for (i = 0; i < tlen; i++) - tbl->map[t[i]] = (short) TR_UNMAPPED; - - for (i = 0, j = 0; i < 256; i++) { - if (!tbl->map[i]) { - if (j == rlen) { - if (del) - tbl->map[i] = (short) TR_DELETE; - else if (rlen) - tbl->map[i] = r[j-1]; - else - tbl->map[i] = (short)i; - } - else { - tbl->map[i] = r[j++]; - } - if ( tbl->map[i] >= 0 - && UVCHR_IS_INVARIANT((UV)i) - && !UVCHR_IS_INVARIANT((UV)(tbl->map[i])) - ) - grows = TRUE; - } + /* Any code points above our limit should be irrelevant */ + if (t_array[i] >= tbl->size) break; + + /* Set up the map */ + if (to == (short) TR_SPECIAL_HANDLING && ! del) { + to = final_map; + do_increment = FALSE; + } + else if (to < 0) { + do_increment = FALSE; } - ASSUME(j <= rlen); - excess = rlen - j; + /* Create a map for everything in this range. The value increases + * except for the special cases */ + for (j = t_array[i]; j < upper; j++) { + tbl->map[j] = to; + if (do_increment) to++; + } + } - if (excess) { - /* More replacement chars than search chars: - * store excess replacement chars at end of main table. - */ + tbl->map[tbl->size] = del + ? (short) TR_DELETE + : rlen + ? final_map + : (short) TR_R_EMPTY; - struct_size += excess * sizeof(short); - tbl = (OPtrans_map*)PerlMemShared_realloc(tbl, struct_size); - tbl->size += excess; - cPVOPo->op_pv = (char*)tbl; + SvREFCNT_dec(t_invlist); - for (i = 0; i < excess; i++) - tbl->map[i + 256] = r[j+i]; - } - else { - /* no more replacement chars than search chars */ - if (!rlen && !del && !squash) - o->op_private |= OPpTRANS_IDENTICAL; - } +#if 0 /* code that added excess above-255 chars at the end of the table, in + case we ever want to not use the inversion map implementation for + this */ - tbl->map[tbl->size] = del - ? (short) TR_DELETE - : rlen - ? r[rlen - 1] - : (short) TR_R_EMPTY; + ASSUME(j <= rlen); + excess = rlen - j; + + if (excess) { + /* More replacement chars than search chars: + * store excess replacement chars at end of main table. + */ + + struct_size += excess; + tbl = (OPtrans_map*)PerlMemShared_realloc(tbl, + struct_size + excess * sizeof(short)); + tbl->size += excess; + cPVOPo->op_pv = (char*)tbl; + + for (i = 0; i < excess; i++) + tbl->map[i + 256] = r[j+i]; } else { - if (!rlen && !del) { - r = t; rlen = tlen; - if (!squash) - o->op_private |= OPpTRANS_IDENTICAL; - } - else if (!squash && rlen == tlen && memEQ((char*)t, (char*)r, tlen)) { - o->op_private |= OPpTRANS_IDENTICAL; - } + /* no more replacement chars than search chars */ +#endif - for (i = 0; i < 256; i++) - tbl->map[i] = (short) TR_UNMAPPED; - for (i = 0, j = 0; i < tlen; i++,j++) { - if (j >= rlen) { - if (del) { - if (tbl->map[t[i]] == (short) TR_UNMAPPED) - tbl->map[t[i]] = (short) TR_DELETE; - continue; - } - --j; - } - if (tbl->map[t[i]] == (short) TR_UNMAPPED) { - if ( UVCHR_IS_INVARIANT(t[i]) - && ! UVCHR_IS_INVARIANT(r[j])) - grows = TRUE; - tbl->map[t[i]] = r[j]; - } - } - tbl->map[tbl->size] = del - ? (short) TR_UNMAPPED - : rlen - ? (short) TR_UNMAPPED - : (short) TR_R_EMPTY; - } + } - /* both non-utf8 and utf8 code paths end up here */ + Safefree(r_map); - warnins: - if(del && rlen == tlen) { + if(del && rlen != 0 && r_count == t_count) { Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Useless use of /d modifier in transliteration operator"); - } else if(rlen > tlen && !complement) { + } else if(r_count > t_count) { Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Replacement list is longer than search list"); } - if (grows) - o->op_private |= OPpTRANS_GROWS; op_free(expr); op_free(repl); diff --git a/op.h b/op.h index 3781960..537b2ef 100644 --- a/op.h +++ b/op.h @@ -1110,7 +1110,7 @@ C is non-null. For a higher-level interface, see C>. "Use of strings with code points over 0xFF as arguments to " \ "%s operator is not allowed" #endif -#if defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#if defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) || defined(PERL_IN_PERL_C) # define TR_UNMAPPED (UV)-1 # define TR_DELETE (UV)-2 # define TR_R_EMPTY (UV)-3 /* rhs (replacement) is empty */ diff --git a/proto.h b/proto.h index 20e4b0e..51c316e 100644 --- a/proto.h +++ b/proto.h @@ -4859,31 +4859,26 @@ STATIC Size_t S_do_trans_complex(pTHX_ SV * const sv, const OPtrans_map * const #define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX \ assert(sv); assert(tbl) -STATIC Size_t S_do_trans_complex_utf8(pTHX_ SV * const sv) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_DO_TRANS_COMPLEX_UTF8 \ - assert(sv) - STATIC Size_t S_do_trans_count(pTHX_ SV * const sv, const OPtrans_map * const tbl) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_DO_TRANS_COUNT \ assert(sv); assert(tbl) -STATIC Size_t S_do_trans_count_utf8(pTHX_ SV * const sv) +STATIC Size_t S_do_trans_count_invmap(pTHX_ SV * const sv, AV * const map) __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_UTF8 \ - assert(sv) +#define PERL_ARGS_ASSERT_DO_TRANS_COUNT_INVMAP \ + assert(sv); assert(map) + +STATIC Size_t S_do_trans_invmap(pTHX_ SV * const sv, AV * const map) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_DO_TRANS_INVMAP \ + assert(sv); assert(map) STATIC Size_t S_do_trans_simple(pTHX_ SV * const sv, const OPtrans_map * const tbl) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE \ assert(sv); assert(tbl) -STATIC Size_t S_do_trans_simple_utf8(pTHX_ SV * const sv) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_DO_TRANS_SIMPLE_UTF8 \ - assert(sv) - #endif #if defined(PERL_IN_DUMP_C) STATIC CV* S_deb_curcv(pTHX_ I32 ix); @@ -5540,12 +5535,6 @@ STATIC SV * S_space_join_names_mortal(pTHX_ char *const *array); STATIC void S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist); #define PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS \ assert(pRExC_state); assert(invlist) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST -#endif - STATIC U32 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_ADD_DATA \ @@ -5582,13 +5571,6 @@ STATIC SV * S_get_ANYOFM_contents(pTHX_ const regnode * n) STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node); #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ assert(pRExC_state); assert(node) -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \ - assert(invlist) -#endif - STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ assert(pRExC_state); assert(flagp) @@ -5614,46 +5596,12 @@ PERL_STATIC_INLINE SV* S_invlist_contents(pTHX_ SV* const invlist, const bool tr #endif #ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len); -#define PERL_ARGS_ASSERT_INVLIST_EXTEND \ - assert(invlist) -#endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \ - assert(invlist) -#endif - -#ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S_invlist_is_iterating(SV* const invlist) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_INVLIST_IS_ITERATING \ assert(invlist) #endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist); -#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \ - assert(invlist) -#endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist); -#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \ - assert(invlist) -#endif -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end) - __attribute__warn_unused_result__; -#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \ - assert(invlist); assert(start); assert(end) -#endif - -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset); -#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \ - assert(invlist) -#endif STATIC bool S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc); #define PERL_ARGS_ASSERT_IS_SSC_WORTH_IT \ assert(pRExC_state); assert(ssc) @@ -5811,6 +5759,55 @@ PERL_CALLCONV void Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char* #define PERL_ARGS_ASSERT__INVLIST_DUMP \ assert(file); assert(indent); assert(invlist) #endif +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_ADD_CP_TO_INVLIST +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \ + assert(invlist) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_extend(pTHX_ SV* const invlist, const UV len); +#define PERL_ARGS_ASSERT_INVLIST_EXTEND \ + assert(invlist) +#endif +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE UV S_invlist_highest(SV* const invlist) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_INVLIST_HIGHEST \ + assert(invlist) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_iterfinish(SV* invlist); +#define PERL_ARGS_ASSERT_INVLIST_ITERFINISH \ + assert(invlist) +#endif +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_iterinit(SV* invlist); +#define PERL_ARGS_ASSERT_INVLIST_ITERINIT \ + assert(invlist) +#endif +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE bool S_invlist_iternext(SV* invlist, UV* start, UV* end) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_INVLIST_ITERNEXT \ + assert(invlist); assert(start); assert(end) +#endif + +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset); +#define PERL_ARGS_ASSERT_INVLIST_SET_LEN \ + assert(invlist) +#endif +#endif #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) PERL_CALLCONV bool Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b); #define PERL_ARGS_ASSERT__INVLISTEQ \ @@ -5832,7 +5829,7 @@ PERL_CALLCONV void Perl_regprop(pTHX_ const regexp *prog, SV* sv, const regnode* #define PERL_ARGS_ASSERT_REGPROP \ assert(sv); assert(o) #endif -#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) +#if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_REGEXEC_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C) || defined(PERL_IN_OP_C) || defined(PERL_IN_DOOP_C) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE bool S__invlist_contains_cp(SV* const invlist, const UV cp) __attribute__warn_unused_result__; diff --git a/toke.c b/toke.c index 0fdcadc..2c448eb 100644 --- a/toke.c +++ b/toke.c @@ -2905,12 +2905,12 @@ S_scan_const(pTHX_ char *start) bool dorange = FALSE; /* are we in a translit range? */ bool didrange = FALSE; /* did we just finish a range? */ bool in_charclass = FALSE; /* within /[...]/ */ - bool d_is_utf8 = FALSE; /* Output constant is UTF8 */ bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be UTF8? But, this can show as true when the source isn't utf8, as for example when it is entirely composed of hex constants */ + bool d_is_utf8 = FALSE; /* Output constant is UTF8 */ STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the number of characters found so far that will expand (into 2 bytes) @@ -2951,11 +2951,6 @@ S_scan_const(pTHX_ char *start) PERL_ARGS_ASSERT_SCAN_CONST; assert(PL_lex_inwhat != OP_TRANSR); - if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) { - /* If we are doing a trans and we know we want UTF8, set expectation */ - d_is_utf8 = PL_parser->lex_sub_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF); - s_is_utf8 = PL_parser->lex_sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - } /* Protect sv from errors and fatal warnings. */ ENTER_with_name("scan_const"); @@ -3646,13 +3641,6 @@ S_scan_const(pTHX_ char *start) } d = (char*)uvchr_to_utf8((U8*)d, uv); - if (PL_lex_inwhat == OP_TRANS - && PL_parser->lex_sub_op) - { - PL_parser->lex_sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF - : OPpTRANS_TO_UTF); - } } } #ifdef EBCDIC @@ -4133,10 +4121,6 @@ S_scan_const(pTHX_ char *start) SvPOK_on(sv); if (d_is_utf8) { SvUTF8_on(sv); - if (PL_lex_inwhat == OP_TRANS && PL_parser->lex_sub_op) { - PL_parser->lex_sub_op->op_private |= - (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); - } } /* shrink the sv if we allocated more than we used */ @@ -10297,9 +10281,7 @@ S_scan_trans(pTHX_ char *start) o = newPVOP(nondestruct ? OP_TRANSR : OP_TRANS, 0, (char*)NULL); o->op_private &= ~OPpTRANS_ALL; - o->op_private |= del|squash|complement| - (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)| - (DO_UTF8(PL_parser->lex_sub_repl) ? OPpTRANS_TO_UTF : 0); + o->op_private |= del|squash|complement; PL_lex_op = o; pl_yylval.ival = nondestruct ? OP_TRANSR : OP_TRANS; -- 1.8.3.1