This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Don't try to case change surrogates, above-Unicodes
authorKarl Williamson <khw@cpan.org>
Thu, 3 Dec 2015 22:56:36 +0000 (15:56 -0700)
committerKarl Williamson <khw@cpan.org>
Thu, 10 Dec 2015 02:17:58 +0000 (19:17 -0700)
Changing the case (upper, lower, title, fold) of surrogate code points
and non-Unicode code points always yields the original, so there is no
need to actually try it.  And trying it is slow and creates swashes,
which uses up runtime memory.  We test for these code points anyway, so
at the cost of just two gotos and a label, we can skip all that work and
potential memory use.  This is worth doing even though such usage will
be rare in practice.

Running the following command

    blead Porting/bench.pl --perlargs="-Ilib -X" --benchfile=above_unicode path_to_prior_perl=before_this_commit path_to_this_perl=after

on a -O2 no DEBUGGING perl, where file 'above_unicode" contains

    [
        'string::casing::above_unicode' => {
            desc    => 'yes cases vs no casing',
            setup   => 'my $a = "\x{110000}"',
            code    => 'my $b = uc($a)'
        },
    ];

yields this output (the extra cost of swash creation is not included):

 The numbers represent raw counts per loop iteration.

 string::casing::above_unicode
 yes cases vs no casing

        before_this_commit    after
        ------------------ --------
     Ir             1329.0    651.0
     Dr              324.0    190.0
     Dw              149.0     94.0
   COND              192.0    103.0
    IND               13.0     10.0

 COND_m                5.5      0.0
  IND_m                6.0      4.0

  Ir_m1                0.1      0.0
  Dr_m1                0.0      0.0
  Dw_m1                0.0      0.0

  Ir_mm                0.0      0.0
  Dr_mm                0.0      0.0
  Dw_mm                0.0      0.0

utf8.c

diff --git a/utf8.c b/utf8.c
index bb500b4..4c1fed4 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1928,8 +1928,9 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
                    "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
            }
+            goto cases_to_self;
        }
-       else if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
+       if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
             if (   UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
                 && ckWARN_d(WARN_DEPRECATED))
             {
@@ -1941,6 +1942,7 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
                Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
                    "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
            }
+            goto cases_to_self;
        }
 
        /* Note that non-characters are perfectly legal, so no warning should
@@ -2002,6 +2004,7 @@ S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
 
     /* Here, there was no mapping defined, which means that the code point maps
      * to itself.  Return the inputs */
+  cases_to_self:
     len = UTF8SKIP(p);
     if (p != ustrp) {   /* Don't copy onto itself */
         Copy(p, ustrp, len, U8);