fix "\x{100}..." =~ tr/.../.../cd
authorDavid Mitchell <davem@iabyn.com>
Thu, 11 Jan 2018 11:45:49 +0000 (11:45 +0000)
committerDavid Mitchell <davem@iabyn.com>
Fri, 19 Jan 2018 13:45:19 +0000 (13:45 +0000)
In transliterations where the search and replacement charlists are
non-utf8, but where the string being modified contains codepoints >=
0x100, then tr/.../.../cd would always delete all such codepoints, rather
than potentially mapping some of them.

In more detail: in the presence of /c (complement), an implicit
0x100..0x7fffffff is added to a non-utf8 search charlist. If the
replacement list is longer than the < 0x100 part of the search list, then
the last few replacement chars should in principle be paired off against
the first few of (\x100, \x101, ...). However, this wasn't happening. For
example,

    tr/\x00-\xfd/ABCD/cd

should be equivalent to

    tr/\xfe-\x{7fffffff}/ABCD/d

which should
    map:
        \xfe    => A,
        \xff    => B,
        \x{100} => C,
        \x{101} => D,
    and delete \x{102} onwards.

But instead, it behaved like

    tr/\xfe-\x{7fffffff}/AB/d

and deleted all codepoints >= 0x100.

This commit fixes that by using the extended mapping table format
for all /c variants (formerly it excluded /cd).

I also changed a variable holding the mapped char from being I32 to UV:
principally to avoid a casting mess in the fixed code. This may (or may
not), as a side-effect, have fixed possible issues with very large
codepoints.

doop.c
op.c
t/op/tr.t

diff --git a/doop.c b/doop.c
index 7dc3fe6..c7973ff 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -224,7 +224,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
        else
            d = s;
        dstart = d;
-       if (complement && !del)
+       if (complement)
             /* number of replacement chars in excess of any 0x00..0xff
              * search characters */
            excess = (SSize_t)tbl[0x100];
@@ -235,7 +235,8 @@ S_do_trans_complex(pTHX_ SV * const sv)
                STRLEN len;
                const UV comp = utf8n_to_uvchr(s, send - s, &len,
                                               UTF8_ALLOW_DEFAULT);
-               I32 ch;
+               UV     ch;
+                short sch;
 
                if (comp > 0xff) {
                    if (!complement) {
@@ -245,34 +246,40 @@ S_do_trans_complex(pTHX_ SV * const sv)
                    else {
                         /* use the implicit 0x100..0x7fffffff search range */
                        matches++;
-                       if (!del) {
-                           ch =    (excess == -1)             ? (I32)comp :
-                                 (   excess ==  0
-                                  || excess < (IV)comp - 0xff) ? tbl[0x101]
-                                                               : tbl[comp+2];
-                           if ((UV)ch != pch) {
+                        ch = del
+                               /* setting ch to pch forces char to be deleted */
+                             ? ((excess >= (IV)comp - 0xff) ? (UV)tbl[comp+2]
+                                                            : pch           )
+
+                            : (        (excess == -1)             ? comp :
+                                 (UV)((  excess ==  0
+                                      || excess < (IV)comp - 0xff) ? tbl[0x101]
+                                                                   : tbl[comp+2]
+                                     )
+                               );
+                           if (ch != pch) {
                                d = uvchr_to_utf8(d, ch);
-                               pch = (UV)ch;
+                               pch = ch;
                            }
                            s += len;
                            continue;
-                       }
                    }
                }
-               else if ((ch = tbl[comp]) >= 0) {
+               else if ((sch = tbl[comp]) >= 0) {
+                    ch = (UV)sch;
                    matches++;
-                   if ((UV)ch != pch) {
+                   if (ch != pch) {
                        d = uvchr_to_utf8(d, ch);
-                       pch = (UV)ch;
+                       pch = ch;
                    }
                    s += len;
                    continue;
                }
-               else if (ch == -1) {    /* -1 is unmapped character */
+               else if (sch == -1) {   /* -1 is unmapped character */
                    Move(s, d, len, U8);
                    d += len;
                }
-               else if (ch == -2)      /* -2 is delete character */
+               else if (sch == -2)      /* -2 is delete character */
                    matches++;
                s += len;
                pch = 0xfeedface;
@@ -283,7 +290,8 @@ S_do_trans_complex(pTHX_ SV * const sv)
                STRLEN len;
                const UV comp = utf8n_to_uvchr(s, send - s, &len,
                                               UTF8_ALLOW_DEFAULT);
-               I32 ch;
+               UV     ch;
+               short sch;
                if (comp > 0xff) {
                    if (!complement) {
                        Move(s, d, len, U8);
@@ -292,26 +300,32 @@ S_do_trans_complex(pTHX_ SV * const sv)
                    else {
                         /* use the implicit 0x100..0x7fffffff search range */
                        matches++;
-                       if (!del) {
+                        if (del) {
+                             if (excess >= (IV)comp - 0xff) {
+                                ch = (UV)tbl[comp+2];
+                                d = uvchr_to_utf8(d, ch);
+                            }
+                        }
+                        else {
                             /* tr/...//c should call S_do_trans_count
                              * instead */
                             assert(excess != -1);
-                           ch = (   excess ==  0
-                                  || excess < (IV)comp - 0xff) ? tbl[0x101]
-                                                               : tbl[comp+2];
+                           ch = (UV)(   excess ==  0
+                                      || excess < (IV)comp-0xff) ? tbl[0x101]
+                                                                 : tbl[comp+2];
                             d = uvchr_to_utf8(d, ch);
-                       }
+                        }
                    }
                }
-               else if ((ch = tbl[comp]) >= 0) {
-                   d = uvchr_to_utf8(d, ch);
+               else if ((sch = tbl[comp]) >= 0) {
+                   d = uvchr_to_utf8(d, (UV)sch);
                    matches++;
                }
-               else if (ch == -1) {    /* -1 is unmapped character */
+               else if (sch == -1) {   /* -1 is unmapped character */
                    Move(s, d, len, U8);
                    d += len;
                }
-               else if (ch == -2)      /* -2 is delete character */
+               else if (sch == -2)      /* -2 is delete character */
                    matches++;
                s += len;
            }
diff --git a/op.c b/op.c
index 23fa594..b3c3336 100644 (file)
--- a/op.c
+++ b/op.c
@@ -6632,7 +6632,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
     tbl = (short*)PerlMemShared_calloc(
                     /* one slot for 'extra len' count and one slot
                      * for storing of last replacement char */
-                    (complement && !del) ? 258 : 256,
+                    complement  ? 258 : 256,
                     sizeof(short));
     cPVOPo->op_pv = (char*)tbl;
 
@@ -6662,7 +6662,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 
         assert(j <= (I32)rlen);
 
-       if (!del) {
+        /* populate extended portion of table */
+
+       {
                     /* the repeat char: it may be used to fill the 0x100+
                      * range. For example,
                      *     tr/\x00-AE-\xff/bcd/c
@@ -6697,7 +6699,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                     repeat_char = 0; /* this value isn't used at runtime */
                     /* -1 excess count indicates empty replacement charlist */
                     excess = -1;
-                    if (!squash)
+                    if (!(squash | del))
                         o->op_private |= OPpTRANS_IDENTICAL;
                 }
            }
index 24108a5..db2134b 100644 (file)
--- a/t/op/tr.t
+++ b/t/op/tr.t
@@ -381,15 +381,12 @@ like $@,
 
     $s = $all255_twice_plus;
     $c = $s =~ tr/\x40-\xbf/\x80-\xbf\x00-\x40/csd;
-    {
-        local $TODO = "missing last 0x40";
-        is $s, join('', map chr,
+    is $s, join('', map chr,
                 0x80..0xbf,
                 (map  { ($_, $_) } 0x40..0xbf),
                 0x00..0x40,
             ),
         "/csd =-U";
-    }
     is $c, 0x140, "/csd =-U count";
 
 
@@ -412,9 +409,7 @@ like $@,
 
     $s = $all255_plus;
     $c = $s =~ tr/\x40-\xbf\xf0-\xff/\x80-\xbf\x00-\x3f/cd;
-    {
-        local $TODO = "missing 30-3f";
-        is $s, join('', map chr,
+    is $s, join('', map chr,
                 0x80..0xbf,
                 0x40..0xbf,
                 0x00..0x2f,
@@ -422,7 +417,6 @@ like $@,
                 0x30..0x3f,
                 ),
             "/cd  <U";
-    }
     is $c, 0x90, "/cd  <U count";
 
     $s = $all255_twice_plus . "\x3f\x3f\x{200}\x{300}";
@@ -441,16 +435,13 @@ like $@,
 
     $s = $all255_twice_plus;
     $c = $s =~ tr/\x40-\xbf\xf0-\xff/\x80-\xbf\x00-\x3f/csd;
-    {
-        local $TODO = "missing 30-3f";
-        is $s, join('', map chr, 0x80..0xbf,
+    is $s, join('', map chr, 0x80..0xbf,
                 (map  { ($_, $_) } 0x40..0xbf),
                 0x00..0x2f,
                 (map  { ($_, $_) } 0xf0..0xff),
                 0x30..0x3f,
             ),
         "/csd <U";
-    }
     is $c, 0x120, "/csd <U count";
 }