ext/Encode/encengine.c

   1 /*
   2 Data structures for encoding transformations.
   3
   4 Perl works internally in either a native 'byte' encoding or
   5 in UTF-8 encoded Unicode.  We have no immediate need for a "wchar_t"
   6 representation. When we do we can use utf8_to_uv().
   7
   8 Most character encodings are either simple byte mappings or
   9 variable length multi-byte encodings. UTF-8 can be viewed as a
  10 rather extreme case of the latter.
  11
  12 So to solve an important part of perl's encode needs we need to solve the
  13 "multi-byte -> multi-byte" case. The simple byte forms are then just a
  14 degenerate case. (One of the two multi-byte forms will usually be UTF-8.)
  15
  16 The other type of encoding is a shift encoding where a prefix sequence
  17 determines what subsequent bytes mean. Such encodings have state.
  18
  19 We also need to handle the case where a character in one encoding has to be
  20 represented as multiple characters in the other, e.g. letter+diacritic.
  21
  22 The process can be considered as pseudo perl:
  23
  24 my $dst = '';
  25 while (length($src))
  26  {
  27   my $size    = &src_count($src);
  28   my $in_seq  = substr($src,0,$size,'');
  29   my $out_seq = $s2d_hash{$in_seq};
  30   if (defined $out_seq)
  31    {
  32     $dst .= $out_seq;
  33    }
  34   else
  35    {
  36     # an error condition
  37    }
  38  }
  39 return $dst;
  40
  41 That has the following components:
  42  &src_count - a "rule" for how many bytes make up the next character in the
  43               source.
  44  %s2d_hash  - a mapping from input sequences to output sequences
  45
  46 The problem with that scheme is that it does not allow the output
  47 character repertoire to affect the characters considered from the
  48 input.
  49
  50 So we use a "trie" representation which can also be considered
  51 a state machine:
  52
  53 my $dst   = '';
  54 my $seq   = \@s2d_seq;
  55 my $next  = \@s2d_next;
  56 while (length($src))
  57  {
  58   my $byte    = substr($src,0,1,'');
  59   my $out_seq = $seq->[$byte];
  60   if (defined $out_seq)
  61    {
  62     $dst .= $out_seq;
  63    }
  64   else
  65    {
  66     # an error condition
  67    }
  68   ($next,$seq) = @$next->[$byte] if $next;
  69  }
  70 return $dst;
  71
  72 There is now a pair of data structures to represent everything.
  73 It is valid for the output sequence at a particular point to
  74 be defined but zero length; that just means "don't know yet".
  75 For the single byte case there is no 'next' so new tables will be the same as
  76 the original tables. For a multi-byte case a prefix byte will flip to the tables
  77 for the next page (adding nothing to the output), then the tables for the page
  78 will provide the actual output and set tables back to the original base page.
  79
  80 This scheme can also handle shift encodings.
  81
  82 A slight enhancement to the scheme also allows for look-ahead - if
  83 we add a flag to re-add the removed byte to the source we could handle
  84   a" -> ä
  85   ab -> a (and take b back please)
  86
  87 */
  88
  89 #include <EXTERN.h>
  90 #include <perl.h>
  91 #define U8 U8
  92 #include "encode.h"
  93
  94 int
  95 do_encode(encpage_t *enc, const U8 *src, STRLEN *slen, U8 *dst, STRLEN dlen, STRLEN *dout, int approx)
  96 {
  97  const U8 *s    = src;
  98  const U8 *send = s+*slen;
  99  const U8 *last = s;
 100  U8 *d          = dst;
 101  U8 *dend       = d+dlen;
 102  int code       = 0;
 103  while (s < send)
 104   {
 105    encpage_t *e = enc;
 106    U8 byte = *s;
 107    while (byte > e->max)
 108     e++;
 109    if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80)))
 110     {
 111      const U8 *cend = s + (e->slen & 0x7f);
 112      if (cend <= send)
 113       {
 114        STRLEN n;
 115        if ((n = e->dlen))
 116         {
 117          const U8 *out  = e->seq+n*(byte - e->min);
 118          U8 *oend = d+n;
 119          if (dst)
 120           {
 121            if (oend <= dend)
 122             {
 123              while (d < oend)
 124               *d++ = *out++;
 125             }
 126            else
 127             {
 128              /* Out of space */
 129              code = ENCODE_NOSPACE;
 130              break;
 131             }
 132           }
 133          else
 134           d = oend;
 135         }
 136        enc = e->next;
 137        s++;
 138        if (s == cend)
 139         {
 140          if (approx && (e->slen & 0x80))
 141           code = ENCODE_FALLBACK;
 142          last = s;
 143         }
 144       }
 145      else
 146       {
 147        /* partial source character */
 148        code = ENCODE_PARTIAL;
 149        break;
 150       }
 151     }
 152    else
 153     {
 154      /* Cannot represent */
 155      code = ENCODE_NOREP;
 156      break;
 157     }
 158   }
 159  *slen = last - src;
 160  *dout = d - dst;
 161  return code;
 162 }
 163
 164