Remove tr///CU (the feature is to be obsoleted by better interfaces).

author Simon Cozens <simon@netthink.co.uk>

Fri, 23 Jun 2000 11:05:40 +0000 (11:05 +0000)

committer Jarkko Hietaniemi <jhi@iki.fi>

Fri, 23 Jun 2000 13:24:15 +0000 (13:24 +0000)
author Simon Cozens <simon@netthink.co.uk>
Fri, 23 Jun 2000 11:05:40 +0000 (11:05 +0000)
committer Jarkko Hietaniemi <jhi@iki.fi>
Fri, 23 Jun 2000 13:24:15 +0000 (13:24 +0000)
diff --git a/doop.c b/doop.c

index ebac52f..fe2df46 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -214,188 +214,6 @@ S_do_trans_UU_count(pTHX_ SV *sv)
  }
  
  STATIC I32
-S_do_trans_UC_simple(pTHX_ SV *sv)
-{
-    dTHR;
-    U8 *s;
-    U8 *send;
-    U8 *d;
-    I32 matches = 0;
-    STRLEN len;
-
-    SV* rv = (SV*)cSVOP->op_sv;
-    HV* hv = (HV*)SvRV(rv);
-    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
-    UV none = svp ? SvUV(*svp) : 0x7fffffff;
-    UV extra = none + 1;
-    UV final;
-    UV uv;
-
-    s = (U8*)SvPV(sv, len);
-    send = s + len;
-
-    svp = hv_fetch(hv, "FINAL", 5, FALSE);
-    if (svp)
-       final = SvUV(*svp);
-
-    d = s;
-    while (s < send) {
-       if ((uv = swash_fetch(rv, s)) < none) {
-           s += UTF8SKIP(s);
-           matches++;
-           *d++ = (U8)uv;
-       }
-       else if (uv == none) {
-           I32 ulen;
-           uv = utf8_to_uv(s, &ulen);
-           s += ulen;
-           *d++ = (U8)uv;
-       }
-       else if (uv == extra) {
-           s += UTF8SKIP(s);
-           matches++;
-           *d++ = (U8)final;
-       }
-       else
-           s += UTF8SKIP(s);
-    }
-    *d = '\0';
-    SvCUR_set(sv, d - (U8*)SvPVX(sv));
-    SvSETMAGIC(sv);
-
-    return matches;
-}
-
-STATIC I32
-S_do_trans_CU_simple(pTHX_ SV *sv)
-{
-    dTHR;
-    U8 *s;
-    U8 *send;
-    U8 *d;
-    U8 *dst;
-    I32 matches = 0;
-    STRLEN len;
-
-    SV* rv = (SV*)cSVOP->op_sv;
-    HV* hv = (HV*)SvRV(rv);
-    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
-    UV none = svp ? SvUV(*svp) : 0x7fffffff;
-    UV extra = none + 1;
-    UV final;
-    UV uv;
-    U8 tmpbuf[UTF8_MAXLEN];
-    I32 bits = 16;
-
-    s = (U8*)SvPV(sv, len);
-    send = s + len;
-
-    svp = hv_fetch(hv, "BITS", 4, FALSE);
-    if (svp)
-       bits = (I32)SvIV(*svp);
-
-    svp = hv_fetch(hv, "FINAL", 5, FALSE);
-    if (svp)
-       final = SvUV(*svp);
-
-    Newz(801, d, len * (bits >> 3) + 1, U8);
-    dst = d;
-
-    while (s < send) {
-       uv = *s++;
-       if (uv < 0x80)
-           tmpbuf[0] = uv;
-       else {
-           tmpbuf[0] = (( uv >>  6)         | 0xc0);
-           tmpbuf[1] = (( uv        & 0x3f) | 0x80);
-       }
-
-       if ((uv = swash_fetch(rv, tmpbuf)) < none) {
-           matches++;
-           d = uv_to_utf8(d, uv);
-       }
-       else if (uv == none)
-           d = uv_to_utf8(d, s[-1]);
-       else if (uv == extra) {
-           matches++;
-           d = uv_to_utf8(d, final);
-       }
-    }
-    *d = '\0';
-    sv_usepvn_mg(sv, (char*)dst, d - dst);
-
-    return matches;
-}
-
-/* utf-8 to latin-1 */
-
-STATIC I32
-S_do_trans_UC_trivial(pTHX_ SV *sv)
-{
-    dTHR;
-    U8 *s;
-    U8 *send;
-    U8 *d;
-    STRLEN len;
-
-    s = (U8*)SvPV(sv, len);
-    send = s + len;
-
-    d = s;
-    while (s < send) {
-       if (*s < 0x80)
-           *d++ = *s++;
-       else {
-           I32 ulen;
-           UV uv = utf8_to_uv(s, &ulen);
-           s += ulen;
-           *d++ = (U8)uv;
-       }
-    }
-    *d = '\0';
-    SvCUR_set(sv, d - (U8*)SvPVX(sv));
-    SvSETMAGIC(sv);
-
-    return SvCUR(sv);
-}
-
-/* latin-1 to utf-8 */
-
-STATIC I32
-S_do_trans_CU_trivial(pTHX_ SV *sv)
-{
-    dTHR;
-    U8 *s;
-    U8 *send;
-    U8 *d;
-    U8 *dst;
-    I32 matches;
-    STRLEN len;
-
-    s = (U8*)SvPV(sv, len);
-    send = s + len;
-
-    Newz(801, d, len * 2 + 1, U8);
-    dst = d;
-
-    matches = send - s;
-
-    while (s < send) {
-       if (*s < 0x80)
-           *d++ = *s++;
-       else {
-           UV uv = *s++;
-           *d++ = (( uv >>  6)         | 0xc0);
-           *d++ = (( uv        & 0x3f) | 0x80);
-       }
-    }
-    *d = '\0';
-    sv_usepvn_mg(sv, (char*)dst, d - dst);
-
-    return matches;
-}
-
-STATIC I32
  S_do_trans_UU_complex(pTHX_ SV *sv)
  {
      dTHR;
@@ -601,31 +419,19 @@ Perl_do_trans(pTHX_ SV *sv)
  
      switch (PL_op->op_private & 63) {
      case 0:
-       return do_trans_CC_simple(sv);
-
-    case OPpTRANS_FROM_UTF:
-       return do_trans_UC_simple(sv);
-
-    case OPpTRANS_TO_UTF:
-       return do_trans_CU_simple(sv);
-
-    case OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF:
-       return do_trans_UU_simple(sv);
+    if (SvUTF8(sv)) 
+        return do_trans_UU_simple(sv);
+    else
+        return do_trans_CC_simple(sv);
  
      case OPpTRANS_IDENTICAL:
-       return do_trans_CC_count(sv);
-
-    case OPpTRANS_FROM_UTF|OPpTRANS_IDENTICAL:
-       return do_trans_UC_trivial(sv);
-
-    case OPpTRANS_TO_UTF|OPpTRANS_IDENTICAL:
-       return do_trans_CU_trivial(sv);
-
-    case OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF|OPpTRANS_IDENTICAL:
-       return do_trans_UU_count(sv);
+    if (SvUTF8(sv)) 
+        return do_trans_UU_count(sv);
+    else
+        return do_trans_CC_count(sv);
  
      default:
-       if (PL_op->op_private & (OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF))
+       if (SvUTF8(sv))
             return do_trans_UU_complex(sv); /* could be UC or CU too */
         else
             return do_trans_CC_complex(sv);
diff --git a/embed.pl b/embed.pl

index f807d96..377491d 100755 (executable)
--- a/embed.pl
+++ b/embed.pl
@@ -2048,6 +2048,8 @@ Ap        |U8*    |utf16_to_utf8  |U16* p|U8 *d|I32 bytelen
  Ap     |U8*    |utf16_to_utf8_reversed|U16* p|U8 *d|I32 bytelen
  Ap     |I32    |utf8_distance  |U8 *a|U8 *b
  Ap     |U8*    |utf8_hop       |U8 *s|I32 off
+Ap     |U8*    |utf8_to_bytes  |U8 *s|STRLEN len
+Ap     |U8*    |bytes_to_utf8  |U8 *s|STRLEN len
  Ap     |UV     |utf8_to_uv     |U8 *s|I32* retlen
  Ap     |U8*    |uv_to_utf8     |U8 *d|UV uv
  p      |void   |vivify_defelem |SV* sv
@@ -2188,8 +2190,6 @@ s |I32    |do_trans_CC_complex    |SV *sv
  s      |I32    |do_trans_UU_simple     |SV *sv
  s      |I32    |do_trans_UU_count      |SV *sv
  s      |I32    |do_trans_UU_complex    |SV *sv
-s      |I32    |do_trans_UC_simple     |SV *sv
-s      |I32    |do_trans_CU_simple     |SV *sv
  s      |I32    |do_trans_UC_trivial    |SV *sv
  s      |I32    |do_trans_CU_trivial    |SV *sv
  #endif
diff --git a/pod/perlop.pod b/pod/perlop.pod

index b4caed9..3c84e60 100644 (file)
--- a/pod/perlop.pod
+++ b/pod/perlop.pod
@@ -1207,9 +1207,9 @@ to occur that you might want.  Here are two common cases:
      # expand tabs to 8-column spacing
      1 while s/\t+/' ' x (length($&)*8 - length($`)%8)/e;
  
-=item tr/SEARCHLIST/REPLACEMENTLIST/cdsUC
+=item tr/SEARCHLIST/REPLACEMENTLIST/cds
  
-=item y/SEARCHLIST/REPLACEMENTLIST/cdsUC
+=item y/SEARCHLIST/REPLACEMENTLIST/cds
  
  Transliterates all occurrences of the characters found in the search list
  with the corresponding character in the replacement list.  It returns
@@ -1243,8 +1243,6 @@ Options:
      c  Complement the SEARCHLIST.
      d  Delete found but unreplaced characters.
      s  Squash duplicate replaced characters.
-    U  Translate to/from UTF-8.
-    C  Translate to/from 8-bit char (octet).
  
  If the C</c> modifier is specified, the SEARCHLIST character set
  is complemented.  If the C</d> modifier is specified, any characters
@@ -1262,10 +1260,6 @@ enough.  If the REPLACEMENTLIST is empty, the SEARCHLIST is replicated.
  This latter is useful for counting characters in a class or for
  squashing character sequences in a class.
  
-The first C</U> or C</C> modifier applies to the left side of the translation.
-The second one applies to the right side.  If present, these modifiers override
-the current utf8 state.
-
  Examples:
  
      $ARGV[1] =~ tr/A-Z/a-z/;   # canonicalize to lower case
@@ -1285,9 +1279,6 @@ Examples:
      tr [\200-\377]
         [\000-\177];            # delete 8th bit
  
-    tr/\0-\xFF//CU;            # change Latin-1 to Unicode
-    tr/\0-\x{FF}//UC;          # change Unicode to Latin-1
-
  If multiple transliterations are given for a character, only the
  first one is used:
  
diff --git a/toke.c b/toke.c

index 05822e1..fe14358 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -6130,45 +6130,20 @@ S_scan_trans(pTHX_ char *start)
         Perl_croak(aTHX_ "Transliteration replacement not terminated");
      }
  
-    if (UTF) {
-       o = newSVOP(OP_TRANS, 0, 0);
-       utf8 = OPpTRANS_FROM_UTF|OPpTRANS_TO_UTF;
-    }
-    else {
         New(803,tbl,256,short);
         o = newPVOP(OP_TRANS, 0, (char*)tbl);
-       utf8 = 0;
-    }
  
      complement = del = squash = 0;
-    while (strchr("cdsCU", *s)) {
+    while (strchr("cds", *s)) {
         if (*s == 'c')
             complement = OPpTRANS_COMPLEMENT;
         else if (*s == 'd')
             del = OPpTRANS_DELETE;
         else if (*s == 's')
             squash = OPpTRANS_SQUASH;
-       else {
-           switch (count++) {
-           case 0:
-               if (*s == 'C')
-                   utf8 &= ~OPpTRANS_FROM_UTF;
-               else
-                   utf8 |= OPpTRANS_FROM_UTF;
-               break;
-           case 1:
-               if (*s == 'C')
-                   utf8 &= ~OPpTRANS_TO_UTF;
-               else
-                   utf8 |= OPpTRANS_TO_UTF;
-               break;
-           default: 
-               Perl_croak(aTHX_ "Too many /C and /U options");
-           }
-       }
         s++;
      }
-    o->op_private = del|squash|complement|utf8;
+    o->op_private = del|squash|complement;
  
      PL_lex_op = o;
      yylval.ival = OP_TRANS;
diff --git a/utf8.c b/utf8.c

index 76eb932..b570b12 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -222,6 +222,72 @@ Perl_utf8_hop(pTHX_ U8 *s, I32 off)
      return s;
  }
  
+/*
+=for apidoc utf8_to_bytes
+
+Converts a string C<s> of length C<len> from UTF8 into ASCII encoding.
+Unlike C<bytes_to_utf8>, this over-writes the original string.
+
+=cut
+*/
+
+U8 *
+Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN len)
+{
+    dTHR;
+    U8 *send;
+    U8 *d;
+    U8 *save;
+
+    send = s + len;
+    d = save = s;
+    while (s < send) {
+        if (*s < 0x80)
+            *d++ = *s++;
+        else {
+            I32 ulen;
+            UV uv = utf8_to_uv(s, &ulen);
+            s += ulen;
+            *d++ = (U8)uv;
+        }
+    }
+    *d = '\0';
+    return save;
+}
+
+/*
+=for apidoc bytes_to_utf8
+
+Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
+Returns a pointer to the newly-created string.
+
+*/
+
+U8*
+Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN len)
+{
+    dTHR;
+    U8 *send;
+    U8 *d;
+    U8 *dst;
+    send = s + len;
+
+    Newz(801, d, len * 2 + 1, U8);
+    dst = d;
+
+    while (s < send) {
+        if (*s < 0x80)
+            *d++ = *s++;
+        else {
+            UV uv = *s++;
+            *d++ = (( uv >>  6)         | 0xc0);
+            *d++ = (( uv        & 0x3f) | 0x80);
+        }
+    }
+    *d = '\0';
+    return dst;
+}
+
  /* XXX NOTHING CALLS THE FOLLOWING TWO ROUTINES YET!!! */
  /*
   * Convert native or reversed UTF-16 to UTF-8.
author	Simon Cozens <simon@netthink.co.uk>
	Fri, 23 Jun 2000 11:05:40 +0000 (11:05 +0000)
committer	Jarkko Hietaniemi <jhi@iki.fi>
	Fri, 23 Jun 2000 13:24:15 +0000 (13:24 +0000)
doop.c		patch \| blob \| blame \| history
embed.pl		patch \| blob \| blame \| history
pod/perlop.pod		patch \| blob \| blame \| history
toke.c		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history