This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Encode: synch with CPAN version 2.99
[perl5.git] / cpan / Encode / encengine.c
CommitLineData
017e2add
NIS
1/*
2Data structures for encoding transformations.
3
4Perl works internally in either a native 'byte' encoding or
5in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6representation. When we do we can use utf8_to_uv().
7
8Most character encodings are either simple byte mappings or
9variable length multi-byte encodings. UTF-8 can be viewed as a
10rather extreme case of the latter.
11
12So to solve an important part of perl's encode needs we need to solve the
13"multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
14case. (Where one of the multi-byte encodings will usually be UTF-8.)
15
16The other type of encoding is a shift encoding where a prefix sequence
17determines what subsequent bytes mean. Such encodings have state.
18
19We also need to handle the case where a character in one encoding has to be
20represented as multiple characters in the other. e.g. letter+diacritic.
21
22The process can be considered as pseudo perl:
23
24my $dst = '';
25while (length($src))
26 {
20797ee1 27 my $size = src_count($src);
017e2add
NIS
28 my $in_seq = substr($src,0,$size,'');
29 my $out_seq = $s2d_hash{$in_seq};
30 if (defined $out_seq)
31 {
32 $dst .= $out_seq;
33 }
34 else
35 {
36 # an error condition
37 }
38 }
39return $dst;
40
41That has the following components:
42 &src_count - a "rule" for how many bytes make up the next character in the
43 source.
44 %s2d_hash - a mapping from input sequences to output sequences
45
46The problem with that scheme is that it does not allow the output
47character repertoire to affect the characters considered from the
48input.
49
50So we use a "trie" representation which can also be considered
51a state machine:
52
53my $dst = '';
54my $seq = \@s2d_seq;
55my $next = \@s2d_next;
56while (length($src))
57 {
58 my $byte = substr($src,0,1,'');
59 my $out_seq = $seq->[$byte];
60 if (defined $out_seq)
61 {
62 $dst .= $out_seq;
63 }
64 else
65 {
66 # an error condition
67 }
68 ($next,$seq) = @$next->[$byte] if $next;
69 }
70return $dst;
71
72There is now a pair of data structures to represent everything.
73It is valid for output sequence at a particular point to
74be defined but zero length, that just means "don't know yet".
75For the single byte case there is no 'next' so new tables will be the same as
76the original tables. For a multi-byte case a prefix byte will flip to the tables
77for the next page (adding nothing to the output), then the tables for the page
78will provide the actual output and set tables back to original base page.
79
80This scheme can also handle shift encodings.
81
82A slight enhancement to the scheme also allows for look-ahead - if
83we add a flag to re-add the removed byte to the source we could handle
3cd3edd2 84 a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
017e2add
NIS
85 ab -> a (and take b back please)
86
87*/
88
c9955564 89#define PERL_NO_GET_CONTEXT
017e2add
NIS
90#include <EXTERN.h>
91#include <perl.h>
017e2add
NIS
92#include "encode.h"
93
2f2b4ff2 94int
0629a5b3 95do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
d1256cb1 96 STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
017e2add 97{
dd88d393
JH
98 const U8 *s = src;
99 const U8 *send = s + *slen;
100 const U8 *last = s;
101 U8 *d = dst;
220e2d4e 102 U8 *dend = d + dlen, *dlast = d;
dd88d393 103 int code = 0;
20797ee1
DK
104 if (!dst)
105 return ENCODE_NOSPACE;
dd88d393 106 while (s < send) {
c31ca201
SH
107 const encpage_t *e = enc;
108 U8 byte = *s;
109 while (byte > e->max)
110 e++;
111 if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
112 const U8 *cend = s + (e->slen & 0x7f);
113 if (cend <= send) {
114 STRLEN n;
115 if ((n = e->dlen)) {
116 const U8 *out = e->seq + n * (byte - e->min);
117 U8 *oend = d + n;
118 if (dst) {
119 if (oend <= dend) {
120 while (d < oend)
121 *d++ = *out++;
122 }
123 else {
124 /* Out of space */
125 code = ENCODE_NOSPACE;
126 break;
127 }
128 }
129 else
130 d = oend;
131 }
132 enc = e->next;
133 s++;
134 if (s == cend) {
135 if (approx && (e->slen & 0x80))
136 code = ENCODE_FALLBACK;
137 last = s;
138 if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
139 code = ENCODE_FOUND_TERM;
140 break;
141 }
142 dlast = d;
143 }
d1256cb1
RGS
144 }
145 else {
c31ca201
SH
146 /* partial source character */
147 code = ENCODE_PARTIAL;
d1256cb1
RGS
148 break;
149 }
d1256cb1
RGS
150 }
151 else {
c31ca201
SH
152 /* Cannot represent */
153 code = ENCODE_NOREP;
154 break;
d1256cb1
RGS
155 }
156 }
dd88d393
JH
157 *slen = last - src;
158 *dout = d - dst;
159 return code;
017e2add 160}