This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Potentially avoid work when converting to UTF-8
author Karl Williamson <khw@cpan.org>
Fri, 19 Aug 2016 00:54:13 +0000 (18:54 -0600)
committer Karl Williamson <khw@cpan.org>
Tue, 3 Jan 2017 04:46:41 +0000 (21:46 -0700)
Some code points < 256 are the same whether represented in UTF-8, or
not.  Others change to require 2 bytes to represent in UTF-8.  When
parsing a string, using UTF-8 is avoided unless necessary, because of
the extra overhead required for processing UTF-8.  This means that when,
during the parse, we discover we need to convert to UTF-8, we have to,
in effect, reparse whatever we have so far to make sure those code
points that differ under UTF-8 get their proper representation.  This
reparsing would not be necessary if we knew that the string doesn't have
such code points.

It turns out that keeping track of having seen UTF-8 variant code points
is cheap, requiring no extra branch instructions.  And the payoff is
potentially large, avoiding having to reparse the string.  This commit
changes S_scan_const() to keep track of them.

toke.c

diff --git a/toke.c b/toke.c
index 59d0cf3..f27f298 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -2856,6 +2856,11 @@ S_scan_const(pTHX_ char *start)
                                            when the source isn't utf8, as for
                                            example when it is entirely composed
                                            of hex constants */
                                            when the source isn't utf8, as for
                                            example when it is entirely composed
                                            of hex constants */
+    STRLEN utf8_variant_count = 0;      /* When not in UTF-8, this counts the
+                                           number of characters found so far
+                                           that will expand (into 2 bytes)
+                                           should we have to convert to
+                                           UTF-8) */
     SV *res;                           /* result from charnames */
     STRLEN offset_to_max;   /* The offset in the output to where the range
                                high-end character is temporarily placed */
     SV *res;                           /* result from charnames */
     STRLEN offset_to_max;   /* The offset in the output to where the range
                                high-end character is temporarily placed */
@@ -3396,14 +3401,18 @@ S_scan_const(pTHX_ char *start)
                }
                else {
                    if (!has_utf8 && uv > 255) {
                }
                else {
                    if (!has_utf8 && uv > 255) {
-                       /* Might need to recode whatever we have accumulated so
-                        * far if it contains any chars variant in utf8 or
-                        * utf-ebcdic. */
 
 
+                        /* Here, 'uv' won't fit unless we convert to UTF-8.
+                         * If we've only seen invariants so far, all we have to
+                         * do is turn on the flag */
+                        if (utf8_variant_count == 0) {
+                            SvUTF8_on(sv);
+                        }
+                        else {
                        SvCUR_set(sv, d - SvPVX_const(sv));
                        SvPOK_on(sv);
                        *d = '\0';
                        SvCUR_set(sv, d - SvPVX_const(sv));
                        SvPOK_on(sv);
                        *d = '\0';
-                       /* See Note on sizing above.  */
+
                        sv_utf8_upgrade_flags_grow(
                                        sv,
                                        SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
                        sv_utf8_upgrade_flags_grow(
                                        sv,
                                        SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
@@ -3417,11 +3426,14 @@ S_scan_const(pTHX_ char *start)
                                             * to actually grow again */
                                        UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
                        d = SvPVX(sv) + SvCUR(sv);
                                             * to actually grow again */
                                        UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
                        d = SvPVX(sv) + SvCUR(sv);
+                        }
+
                        has_utf8 = TRUE;
                     }
 
                     if (! has_utf8) {
                        *d++ = (char)uv;
                        has_utf8 = TRUE;
                     }
 
                     if (! has_utf8) {
                        *d++ = (char)uv;
+                        utf8_variant_count++;
                     }
                    else {
                        /* Usually, there will already be enough room in 'sv'
                     }
                    else {
                        /* Usually, there will already be enough room in 'sv'
@@ -3561,15 +3573,25 @@ S_scan_const(pTHX_ char *start)
                        if (! has_utf8 && (   uv > 0xFF
                                            || PL_lex_inwhat != OP_TRANS))
                         {
                        if (! has_utf8 && (   uv > 0xFF
                                            || PL_lex_inwhat != OP_TRANS))
                         {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
+
                            SvCUR_set(sv, d - SvPVX_const(sv));
                            SvPOK_on(sv);
                            *d = '\0';
                            SvCUR_set(sv, d - SvPVX_const(sv));
                            SvPOK_on(sv);
                            *d = '\0';
-                           /* See Note on sizing above.  */
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
                            sv_utf8_upgrade_flags_grow(
                                     sv,
                                     SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
                            sv_utf8_upgrade_flags_grow(
                                     sv,
                                     SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                   OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
+                                   extra);
                            d = SvPVX(sv) + SvCUR(sv);
                            d = SvPVX(sv) + SvCUR(sv);
+                            }
+
                            has_utf8 = TRUE;
                        }
 
                            has_utf8 = TRUE;
                        }
 
@@ -3728,14 +3750,23 @@ S_scan_const(pTHX_ char *start)
                          /* Upgrade destination to be utf8 if this new
                           * component is */
                        if (! has_utf8 && SvUTF8(res)) {
                          /* Upgrade destination to be utf8 if this new
                           * component is */
                        if (! has_utf8 && SvUTF8(res)) {
+                           /* See Note on sizing above.  */
+                            const STRLEN extra = len + (send - s) + 1;
+
                            SvCUR_set(sv, d - SvPVX_const(sv));
                            SvPOK_on(sv);
                            *d = '\0';
                            SvCUR_set(sv, d - SvPVX_const(sv));
                            SvPOK_on(sv);
                            *d = '\0';
-                           /* See Note on sizing above.  */
+
+                            if (utf8_variant_count == 0) {
+                                SvUTF8_on(sv);
+                                d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+                            }
+                            else {
                            sv_utf8_upgrade_flags_grow(sv,
                                                SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
                            sv_utf8_upgrade_flags_grow(sv,
                                                SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                               len + (STRLEN)(send - s) + 1);
+                                               extra);
                            d = SvPVX(sv) + SvCUR(sv);
                            d = SvPVX(sv) + SvCUR(sv);
+                            }
                            has_utf8 = TRUE;
                        } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
 
                            has_utf8 = TRUE;
                        } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
 
@@ -3807,11 +3838,16 @@ S_scan_const(pTHX_ char *start)
          * to/from UTF-8.
          *
          * If the input has the same representation in UTF-8 as not, it will be
          * to/from UTF-8.
          *
          * If the input has the same representation in UTF-8 as not, it will be
-         * a single byte, and we don't care about UTF8ness; or if neither
-         * source nor output is UTF-8, just copy the byte */
-        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8))
-        {
+         * a single byte, and we don't care about UTF8ness; just copy the byte */
+        if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
+           *d++ = *s++;
+        }
+        else if (! this_utf8 && ! has_utf8) {
+            /* If neither source nor output is UTF-8, is also a single byte,
+             * just copy it; but this byte counts should we later have to
+             * convert to UTF-8 */
            *d++ = *s++;
            *d++ = *s++;
+            utf8_variant_count++;
         }
         else if (this_utf8 && has_utf8) {   /* Both UTF-8, can just copy */
            const STRLEN len = UTF8SKIP(s);
         }
         else if (this_utf8 && has_utf8) {   /* Both UTF-8, can just copy */
            const STRLEN len = UTF8SKIP(s);
@@ -3829,16 +3865,26 @@ S_scan_const(pTHX_ char *start)
            const UV nextuv   = (this_utf8)
                                 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                 : (UV) ((U8) *s);
            const UV nextuv   = (this_utf8)
                                 ? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
                                 : (UV) ((U8) *s);
-           const STRLEN need = UVCHR_SKIP(nextuv);
+           STRLEN need = UVCHR_SKIP(nextuv);
+
            if (!has_utf8) {
                SvCUR_set(sv, d - SvPVX_const(sv));
                SvPOK_on(sv);
                *d = '\0';
            if (!has_utf8) {
                SvCUR_set(sv, d - SvPVX_const(sv));
                SvPOK_on(sv);
                *d = '\0';
-               /* See Note on sizing above.  */
+
+                /* See Note on sizing above. */
+                need += (STRLEN)(send - s) + 1;
+
+                if (utf8_variant_count == 0) {
+                    SvUTF8_on(sv);
+                    d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+                }
+                else {
                sv_utf8_upgrade_flags_grow(sv,
                                        SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
                sv_utf8_upgrade_flags_grow(sv,
                                        SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       need + (STRLEN)(send - s) + 1);
+                                       need);
                d = SvPVX(sv) + SvCUR(sv);
                d = SvPVX(sv) + SvCUR(sv);
+                }
                has_utf8 = TRUE;
            } else if (need > len) {
                /* encoded value larger than old, may need extra space (NOTE:
                has_utf8 = TRUE;
            } else if (need > len) {
                /* encoded value larger than old, may need extra space (NOTE: