add two structs for OP_TRANS
author David Mitchell <davem@iabyn.com>
Fri, 12 Jan 2018 12:00:30 +0000 (12:00 +0000)
committer David Mitchell <davem@iabyn.com>
Fri, 19 Jan 2018 13:45:20 +0000 (13:45 +0000)
Originally, the op_pv of an OP_TRANS op pointed to a 256-slot array of
shorts, which contained the translations. However, in the presence of
tr///c, extra information needs to be stored to handle utf8 strings.
The 256 slot array was extended, with slot 0x100 holding a length,
slot 0x101 holding the repeated replacement char, and slots 0x102
onwards holding the excess replacement chars.

This has made things a bit messy, so this commit adds two structs,
one being an array of 256 shorts, and the other being the same but with
some extra fields. So for example tbl[0x100] has been replaced with
tbl->excess_len.

This commit should make no functional difference, but will allow us
shortly to fix a bug by changing the type of the excess_len field from
short to something bigger, for example.

doop.c
op.c
op.h

diff --git a/doop.c b/doop.c
index 58a49b0..fa908cf 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -42,7 +42,7 @@ S_do_trans_simple(pTHX_ SV * const sv)
     STRLEN len;
     U8 *s = (U8*)SvPV_nomg(sv,len);
     U8 * const send = s+len;
-    const short * const tbl = (short*)cPVOP->op_pv;
+    const OPtrans_map * const tbl = (OPtrans_map*)cPVOP->op_pv;
 
     PERL_ARGS_ASSERT_DO_TRANS_SIMPLE;
 
@@ -52,7 +52,7 @@ S_do_trans_simple(pTHX_ SV * const sv)
     /* First, take care of non-UTF-8 input strings, because they're easy */
     if (!SvUTF8(sv)) {
        while (s < send) {
-           const I32 ch = tbl[*s];
+           const I32 ch = tbl->map[*s];
            if (ch >= 0) {
                matches++;
                *s = (U8)ch;
@@ -78,7 +78,7 @@ S_do_trans_simple(pTHX_ SV * const sv)
 
            /* Need to check this, otherwise 128..255 won't match */
            const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
-           if (c < 0x100 && (ch = tbl[c]) >= 0) {
+           if (c < 0x100 && (ch = tbl->map[c]) >= 0) {
                matches++;
                d = uvchr_to_utf8(d, ch);
                s += ulen;
@@ -121,7 +121,7 @@ S_do_trans_count(pTHX_ SV * const sv)
     const U8 *s = (const U8*)SvPV_nomg_const(sv, len);
     const U8 * const send = s + len;
     I32 matches = 0;
-    const short * const tbl = (short*)cPVOP->op_pv;
+    const OPtrans_map * const tbl = (OPtrans_map*)cPVOP->op_pv;
 
     PERL_ARGS_ASSERT_DO_TRANS_COUNT;
 
@@ -130,7 +130,7 @@ S_do_trans_count(pTHX_ SV * const sv)
 
     if (!SvUTF8(sv)) {
        while (s < send) {
-            if (tbl[*s++] >= 0)
+            if (tbl->map[*s++] >= 0)
                 matches++;
        }
     }
@@ -140,7 +140,7 @@ S_do_trans_count(pTHX_ SV * const sv)
            STRLEN ulen;
            const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
            if (c < 0x100) {
-               if (tbl[c] >= 0)
+               if (tbl->map[c] >= 0)
                    matches++;
            } else if (complement)
                matches++;
@@ -166,11 +166,11 @@ S_do_trans_complex(pTHX_ SV * const sv)
     U8 *s = (U8*)SvPV_nomg(sv, len);
     U8 * const send = s+len;
     I32 matches = 0;
-    const short * const tbl = (short*)cPVOP->op_pv;
+    const OPtrans_map_ex * const extbl = (OPtrans_map_ex*)cPVOP->op_pv;
 
     PERL_ARGS_ASSERT_DO_TRANS_COMPLEX;
 
-    if (!tbl)
+    if (!extbl)
        Perl_croak(aTHX_ "panic: do_trans_complex line %d",__LINE__);
 
     if (!SvUTF8(sv)) {
@@ -180,7 +180,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
        if (PL_op->op_private & OPpTRANS_SQUASH) {
            const U8* p = send;
            while (s < send) {
-               const I32 ch = tbl[*s];
+               const I32 ch = extbl->map[*s];
                if (ch >= 0) {
                    *d = (U8)ch;
                    matches++;
@@ -196,7 +196,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
        }
        else {
            while (s < send) {
-               const I32 ch = tbl[*s];
+               const I32 ch = extbl->map[*s];
                if (ch >= 0) {
                    matches++;
                    *d++ = (U8)ch;
@@ -227,7 +227,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
        if (complement)
             /* number of replacement chars in excess of any 0x00..0xff
              * search characters */
-           excess = (SSize_t)tbl[0x100];
+           excess = (SSize_t)extbl->excess_len;
 
        if (PL_op->op_private & OPpTRANS_SQUASH) {
            UV pch = 0xfeedface;
@@ -245,16 +245,19 @@ S_do_trans_complex(pTHX_ SV * const sv)
                    }
                    else {
                         /* use the implicit 0x100..0x7fffffff search range */
+                        UV comp100 = comp - 0x100;
                        matches++;
                         ch = del
                                /* setting ch to pch forces char to be deleted */
-                             ? ((excess >= (IV)comp - 0xff) ? (UV)tbl[comp+2]
-                                                            : pch           )
+                             ? ((excess > (IV)comp100)
+                                            ? (UV)extbl->map_ex[comp100]
+                                            : pch           )
 
-                            : (        (excess == -1)             ? comp :
+                            : (        (excess == -1) ? comp :
                                  (UV)((  excess ==  0
-                                      || excess < (IV)comp - 0xff) ? tbl[0x101]
-                                                                   : tbl[comp+2]
+                                      || excess <= (IV)comp100)
+                                            ? extbl->repeat_char
+                                            : extbl->map_ex[comp100]
                                      )
                                );
                         if (ch != pch) {
@@ -265,7 +268,7 @@ S_do_trans_complex(pTHX_ SV * const sv)
                         continue;
                    }
                }
-               else if ((sch = tbl[comp]) >= 0) {
+               else if ((sch = extbl->map[comp]) >= 0) {
                     ch = (UV)sch;
                    matches++;
                    if (ch != pch) {
@@ -299,10 +302,11 @@ S_do_trans_complex(pTHX_ SV * const sv)
                    }
                    else {
                         /* use the implicit 0x100..0x7fffffff search range */
+                        UV comp100 = comp - 0x100;
                        matches++;
                         if (del) {
-                             if (excess >= (IV)comp - 0xff) {
-                                ch = (UV)tbl[comp+2];
+                             if (excess > (IV)comp100) {
+                                ch = (UV)extbl->map_ex[comp100];
                                 d = uvchr_to_utf8(d, ch);
                             }
                         }
@@ -310,14 +314,15 @@ S_do_trans_complex(pTHX_ SV * const sv)
                             /* tr/...//c should call S_do_trans_count
                              * instead */
                             assert(excess != -1);
-                           ch = (UV)(   excess ==  0
-                                      || excess < (IV)comp-0xff) ? tbl[0x101]
-                                                                 : tbl[comp+2];
+                           ch = (   excess ==  0
+                                      || excess <= (IV)comp100)
+                                            ? (UV)extbl->repeat_char
+                                            : (UV)extbl->map_ex[comp100];
                             d = uvchr_to_utf8(d, ch);
                         }
                    }
                }
-               else if ((sch = tbl[comp]) >= 0) {
+               else if ((sch = extbl->map[comp]) >= 0) {
                    d = uvchr_to_utf8(d, (UV)sch);
                    matches++;
                }
diff --git a/op.c b/op.c
index b3c3336..4fb46e1 100644 (file)
--- a/op.c
+++ b/op.c
@@ -6343,7 +6343,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
     I32 i;
     I32 j;
     I32 grows = 0;
-    short *tbl;
+    OPtrans_map *tbl;
 
     const I32 complement = o->op_private & OPpTRANS_COMPLEMENT;
     const I32 squash     = o->op_private & OPpTRANS_SQUASH;
@@ -6629,11 +6629,9 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
      * The toker will have already expanded char ranges in t and r.
      */
 
-    tbl = (short*)PerlMemShared_calloc(
-                    /* one slot for 'extra len' count and one slot
-                     * for storing of last replacement char */
-                    complement  ? 258 : 256,
-                    sizeof(short));
+    tbl = (OPtrans_map*)PerlMemShared_calloc(
+                    complement  ? sizeof(OPtrans_map_ex) : sizeof(OPtrans_map),
+                    sizeof(char));
     cPVOPo->op_pv = (char*)tbl;
 
     if (complement) {
@@ -6641,21 +6639,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
          * with a search char) replacement chars (so j <= rlen always)
          */
        for (i = 0; i < (I32)tlen; i++)
-           tbl[t[i]] = -1;
+           tbl->map[t[i]] = -1;
        for (i = 0, j = 0; i < 256; i++) {
-           if (!tbl[i]) {
+           if (!tbl->map[i]) {
                if (j == (I32)rlen) {
                    if (del)
-                       tbl[i] = -2;
+                       tbl->map[i] = -2;
                    else if (rlen)
-                       tbl[i] = r[j-1];
+                       tbl->map[i] = r[j-1];
                    else
-                       tbl[i] = (short)i;
+                       tbl->map[i] = (short)i;
                }
                else {
                    if (UVCHR_IS_INVARIANT(i) && ! UVCHR_IS_INVARIANT(r[j]))
                        grows = 1;
-                   tbl[i] = r[j++];
+                   tbl->map[i] = r[j++];
                }
            }
        }
@@ -6676,17 +6674,18 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                      */
             short   repeat_char;
             SSize_t excess = rlen - (SSize_t)j;
+            OPtrans_map_ex *extbl = (OPtrans_map_ex*)tbl;
 
            if (excess) {
                 /* More replacement chars than search chars:
                  * store excess replacement chars at end of main table.
                  */
 
-               tbl = (short *) PerlMemShared_realloc(tbl,
-                                         (0x102+excess) * sizeof(short));
-               cPVOPo->op_pv = (char*)tbl;
+               extbl = (OPtrans_map_ex *) PerlMemShared_realloc(extbl,
+                            sizeof(OPtrans_map_ex) + excess * sizeof(short));
+               cPVOPo->op_pv = (char*)extbl;
                 for (i = 0; i < (I32)excess; i++)
-                    tbl[0x102+i] = r[j+i];
+                    extbl->map_ex[i] = r[j+i];
                 repeat_char = r[rlen-1];
            }
            else {
@@ -6703,8 +6702,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                         o->op_private |= OPpTRANS_IDENTICAL;
                 }
            }
-            tbl[0x100] = (short)excess;      /* excess char count */
-            tbl[0x101] = (short)repeat_char; /* repeated replace char */
+            extbl->excess_len  = (short)excess;      /* excess char count */
+            extbl->repeat_char = (short)repeat_char; /* repeated replace char */
        }
     }
     else {
@@ -6717,21 +6716,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
            o->op_private |= OPpTRANS_IDENTICAL;
        }
        for (i = 0; i < 256; i++)
-           tbl[i] = -1;
+           tbl->map[i] = -1;
        for (i = 0, j = 0; i < (I32)tlen; i++,j++) {
            if (j >= (I32)rlen) {
                if (del) {
-                   if (tbl[t[i]] == -1)
-                       tbl[t[i]] = -2;
+                   if (tbl->map[t[i]] == -1)
+                       tbl->map[t[i]] = -2;
                    continue;
                }
                --j;
            }
-           if (tbl[t[i]] == -1) {
+           if (tbl->map[t[i]] == -1) {
                 if (     UVCHR_IS_INVARIANT(t[i])
                     && ! UVCHR_IS_INVARIANT(r[j]))
                    grows = 1;
-               tbl[t[i]] = r[j];
+               tbl->map[t[i]] = r[j];
            }
        }
     }
diff --git a/op.h b/op.h
index eb62c94..aeee339 100644 (file)
--- a/op.h
+++ b/op.h
@@ -627,6 +627,23 @@ typedef enum {
 #define ref(o, type) doref(o, type, TRUE)
 #endif
 
+
+/* basic and extended translation tables attached to OP_TRANS/OP_TRANSR ops */
+
+typedef struct {
+    short map[256]; /* indexed by char 0x00..0xff; values >= 0 are treated as the replacement char, op.c stores -1 or -2 otherwise */
+} OPtrans_map;
+
+/* used in the presence of tr///c to record any replacement chars that
+ * are paired with the implicit 0x100..0x7fffffff search chars */
+typedef struct {
+    short map[256];   /* same layout as OPtrans_map.map: indexed by char 0x00..0xff */
+    short excess_len; /* number of entries in map_ex[] */
+    short repeat_char; /* replacement used (without /d) for chars >= 0x100 once map_ex[] is exhausted */
+    short map_ex[1];  /* Unwarranted chumminess: indexed by (char - 0x100); op.c reallocs the struct to hold the excess entries */
+} OPtrans_map_ex;
+
+
 /*
 =head1 Optree Manipulation Functions