Commit | Line | Data |
---|---|---|
017e2add NIS |
1 | /* |
2 | Data structures for encoding transformations. | |
3 | ||
4 | Perl works internally in either a native 'byte' encoding or | |
5 | in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t" | |
6 | representation. When we do we can use utf8_to_uv(). | |
7 | ||
8 | Most character encodings are either simple byte mappings or | |
9 | variable length multi-byte encodings. UTF-8 can be viewed as a | |
10 | rather extreme case of the latter. | |
11 | ||
12 | So to solve an important part of perl's encode needs we need to solve the | |
13 | "multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate | |
14 | case. (Where one of the multi-byte encodings will usually be UTF-8.) | |
15 | ||
16 | The other type of encoding is a shift encoding where a prefix sequence | |
17 | determines what subsequent bytes mean. Such encodings have state. | |
18 | ||
19 | We also need to handle the case where a character in one encoding has to be | |
20 | represented as multiple characters in the other. e.g. letter+diacritic. | |
21 | ||
22 | The process can be considered as pseudo perl: | |
23 | ||
24 | my $dst = ''; | |
25 | while (length($src)) | |
26 | { | |
20797ee1 | 27 | my $size = src_count($src); |
017e2add NIS |
28 | my $in_seq = substr($src,0,$size,''); |
29 | my $out_seq = $s2d_hash{$in_seq}; | |
30 | if (defined $out_seq) | |
31 | { | |
32 | $dst .= $out_seq; | |
33 | } | |
34 | else | |
35 | { | |
36 | # an error condition | |
37 | } | |
38 | } | |
39 | return $dst; | |
40 | ||
41 | That has the following components: | |
42 | &src_count - a "rule" for how many bytes make up the next character in the | |
43 | source. | |
44 | %s2d_hash - a mapping from input sequences to output sequences | |
45 | ||
46 | The problem with that scheme is that it does not allow the output | |
47 | character repertoire to affect the characters considered from the | |
48 | input. | |
49 | ||
50 | So we use a "trie" representation which can also be considered | |
51 | a state machine: | |
52 | ||
53 | my $dst = ''; | |
54 | my $seq = \@s2d_seq; | |
55 | my $next = \@s2d_next; | |
56 | while (length($src)) | |
57 | { | |
58 | my $byte = substr($src,0,1,''); | |
59 | my $out_seq = $seq->[$byte]; | |
60 | if (defined $out_seq) | |
61 | { | |
62 | $dst .= $out_seq; | |
63 | } | |
64 | else | |
65 | { | |
66 | # an error condition | |
67 | } | |
68 | ($next,$seq) = @$next->[$byte] if $next; | |
69 | } | |
70 | return $dst; | |
71 | ||
72 | There is now a pair of data structures to represent everything. | |
73 | It is valid for the output sequence at a particular point to | |
74 | be defined but zero length, that just means "don't know yet". | |
75 | For the single byte case there is no 'next' so new tables will be the same as | |
76 | the original tables. For a multi-byte case a prefix byte will flip to the tables | |
77 | for the next page (adding nothing to the output), then the tables for the page | |
78 | will provide the actual output and set tables back to original base page. | |
79 | ||
80 | This scheme can also handle shift encodings. | |
81 | ||
82 | A slight enhancement to the scheme also allows for look-ahead - if | |
83 | we add a flag to re-add the removed byte to the source we could handle | |
3cd3edd2 | 84 | a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS) |
017e2add NIS |
85 | ab -> a (and take b back please) |
86 | ||
87 | */ | |
88 | ||
c9955564 | 89 | #define PERL_NO_GET_CONTEXT |
017e2add NIS |
90 | #include <EXTERN.h> |
91 | #include <perl.h> | |
017e2add NIS |
92 | #include "encode.h" |
93 | ||
2f2b4ff2 | 94 | int |
0629a5b3 | 95 | do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst, |
d1256cb1 | 96 | STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen) |
017e2add | 97 | { |
dd88d393 JH |
98 | const U8 *s = src; |
99 | const U8 *send = s + *slen; | |
100 | const U8 *last = s; | |
101 | U8 *d = dst; | |
220e2d4e | 102 | U8 *dend = d + dlen, *dlast = d; |
dd88d393 | 103 | int code = 0; |
20797ee1 DK |
104 | if (!dst) |
105 | return ENCODE_NOSPACE; | |
dd88d393 | 106 | while (s < send) { |
c31ca201 SH |
107 | const encpage_t *e = enc; |
108 | U8 byte = *s; | |
109 | while (byte > e->max) | |
110 | e++; | |
111 | if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) { | |
112 | const U8 *cend = s + (e->slen & 0x7f); | |
113 | if (cend <= send) { | |
114 | STRLEN n; | |
115 | if ((n = e->dlen)) { | |
116 | const U8 *out = e->seq + n * (byte - e->min); | |
117 | U8 *oend = d + n; | |
118 | if (dst) { | |
119 | if (oend <= dend) { | |
120 | while (d < oend) | |
121 | *d++ = *out++; | |
122 | } | |
123 | else { | |
124 | /* Out of space */ | |
125 | code = ENCODE_NOSPACE; | |
126 | break; | |
127 | } | |
128 | } | |
129 | else | |
130 | d = oend; | |
131 | } | |
132 | enc = e->next; | |
133 | s++; | |
134 | if (s == cend) { | |
135 | if (approx && (e->slen & 0x80)) | |
136 | code = ENCODE_FALLBACK; | |
137 | last = s; | |
138 | if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) { | |
139 | code = ENCODE_FOUND_TERM; | |
140 | break; | |
141 | } | |
142 | dlast = d; | |
143 | } | |
d1256cb1 RGS |
144 | } |
145 | else { | |
c31ca201 SH |
146 | /* partial source character */ |
147 | code = ENCODE_PARTIAL; | |
d1256cb1 RGS |
148 | break; |
149 | } | |
d1256cb1 RGS |
150 | } |
151 | else { | |
c31ca201 SH |
152 | /* Cannot represent */ |
153 | code = ENCODE_NOREP; | |
154 | break; | |
d1256cb1 RGS |
155 | } |
156 | } | |
dd88d393 JH |
157 | *slen = last - src; |
158 | *dout = d - dst; | |
159 | return code; | |
017e2add | 160 | } |