Commit | Line | Data |
---|---|---|
85982a32 | 1 | /* |
cc7dbc11 | 2 | $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp dankogai $ |
85982a32 JH |
3 | */ |
4 | ||
5 | #define PERL_NO_GET_CONTEXT | |
6 | #include "EXTERN.h" | |
7 | #include "perl.h" | |
8 | #include "XSUB.h" | |
6d1c0808 JH |
9 | #define U8 U8 |
10 | #include "../Encode/encode.h" | |
85982a32 JH |
11 | |
12 | #define FBCHAR 0xFFFd | |
13 | #define BOM_BE 0xFeFF | |
14 | #define BOM16LE 0xFFFe | |
15 | #define BOM32LE 0xFFFe0000 | |
16 | #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF ) | |
17 | #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 ) | |
18 | #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF ) | |
19 | #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) ) | |
20 | ||
21 | static UV | |
22 | enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian) | |
23 | { | |
24 | U8 *s = *sp; | |
25 | UV v = 0; | |
26 | if (s+size > e) { | |
27 | croak("Partial character %c",(char) endian); | |
28 | } | |
29 | switch(endian) { | |
30 | case 'N': | |
31 | v = *s++; | |
32 | v = (v << 8) | *s++; | |
33 | case 'n': | |
34 | v = (v << 8) | *s++; | |
35 | v = (v << 8) | *s++; | |
36 | break; | |
37 | case 'V': | |
38 | case 'v': | |
39 | v |= *s++; | |
40 | v |= (*s++ << 8); | |
41 | if (endian == 'v') | |
42 | break; | |
43 | v |= (*s++ << 16); | |
44 | v |= (*s++ << 24); | |
45 | break; | |
46 | default: | |
47 | croak("Unknown endian %c",(char) endian); | |
48 | break; | |
49 | } | |
50 | *sp = s; | |
51 | return v; | |
52 | } | |
53 | ||
54 | void | |
55 | enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value) | |
56 | { | |
57 | U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size); | |
58 | switch(endian) { | |
59 | case 'v': | |
60 | case 'V': | |
61 | d += SvCUR(result); | |
62 | SvCUR_set(result,SvCUR(result)+size); | |
63 | while (size--) { | |
7c436af3 | 64 | *d++ = (U8)(value & 0xFF); |
85982a32 JH |
65 | value >>= 8; |
66 | } | |
67 | break; | |
68 | case 'n': | |
69 | case 'N': | |
70 | SvCUR_set(result,SvCUR(result)+size); | |
71 | d += SvCUR(result); | |
72 | while (size--) { | |
7c436af3 | 73 | *--d = (U8)(value & 0xFF); |
85982a32 JH |
74 | value >>= 8; |
75 | } | |
76 | break; | |
77 | default: | |
78 | croak("Unknown endian %c",(char) endian); | |
79 | break; | |
80 | } | |
81 | } | |
82 | ||
83 | MODULE = Encode::Unicode PACKAGE = Encode::Unicode | |
84 | ||
6d1c0808 JH |
85 | PROTOTYPES: DISABLE |
86 | ||
85982a32 | 87 | void |
6d1c0808 | 88 | decode_xs(obj, str, check = 0) |
85982a32 JH |
89 | SV * obj |
90 | SV * str | |
6d1c0808 | 91 | IV check |
85982a32 JH |
92 | CODE: |
93 | { | |
94 | int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); | |
95 | U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); | |
96 | int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); | |
97 | SV *result = newSVpvn("",0); | |
98 | STRLEN ulen; | |
99 | U8 *s = (U8 *)SvPVbyte(str,ulen); | |
100 | U8 *e = (U8 *)SvEND(str); | |
101 | ST(0) = sv_2mortal(result); | |
102 | SvUTF8_on(result); | |
103 | ||
104 | if (!endian && s+size <= e) { | |
105 | UV bom; | |
106 | endian = (size == 4) ? 'N' : 'n'; | |
107 | bom = enc_unpack(aTHX_ &s,e,size,endian); | |
108 | if (bom != BOM_BE) { | |
109 | if (bom == BOM16LE) { | |
110 | endian = 'v'; | |
111 | } | |
112 | else if (bom == BOM32LE) { | |
113 | endian = 'V'; | |
114 | } | |
115 | else { | |
315b3302 | 116 | croak("%"SVf":Unrecognised BOM %"UVxf, |
0f7c507f | 117 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), |
85982a32 JH |
118 | bom); |
119 | } | |
120 | } | |
121 | #if 0 | |
122 | /* Update endian for this sequence */ | |
123 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); | |
124 | #endif | |
125 | } | |
126 | while (s < e && s+size <= e) { | |
127 | UV ord = enc_unpack(aTHX_ &s,e,size,endian); | |
128 | U8 *d; | |
129 | if (size != 4 && invalid_ucs2(ord)) { | |
130 | if (ucs2) { | |
6d1c0808 | 131 | if (check) { |
0f7c507f JH |
132 | croak("%"SVf":no surrogates allowed %"UVxf, |
133 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), | |
85982a32 JH |
134 | ord); |
135 | } | |
136 | if (s+size <= e) { | |
137 | /* skip the next one as well */ | |
6d1c0808 | 138 | enc_unpack(aTHX_ &s,e,size,endian); |
85982a32 JH |
139 | } |
140 | ord = FBCHAR; | |
141 | } | |
142 | else { | |
143 | UV lo; | |
144 | if (!isHiSurrogate(ord)) { | |
0f7c507f JH |
145 | croak("%"SVf":Malformed HI surrogate %"UVxf, |
146 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), | |
85982a32 JH |
147 | ord); |
148 | } | |
149 | if (s+size > e) { | |
150 | /* Partial character */ | |
151 | s -= size; /* back up to 1st half */ | |
152 | break; /* And exit loop */ | |
153 | } | |
154 | lo = enc_unpack(aTHX_ &s,e,size,endian); | |
155 | if (!isLoSurrogate(lo)){ | |
0f7c507f JH |
156 | croak("%"SVf":Malformed LO surrogate %"UVxf, |
157 | *hv_fetch((HV *)SvRV(obj),"Name",4,0), | |
85982a32 JH |
158 | ord); |
159 | } | |
160 | ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00); | |
161 | } | |
162 | } | |
163 | d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1); | |
164 | d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0); | |
165 | SvCUR_set(result,d - (U8 *)SvPVX(result)); | |
166 | } | |
6d1c0808 | 167 | if (s < e) { |
0f7c507f JH |
168 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", |
169 | *hv_fetch((HV *)SvRV(obj),"Name",4,0)); | |
6d1c0808 JH |
170 | } |
171 | if (check && !(check & ENCODE_LEAVE_SRC)){ | |
172 | if (s < e) { | |
85982a32 JH |
173 | Move(s,SvPVX(str),e-s,U8); |
174 | SvCUR_set(str,(e-s)); | |
175 | } | |
176 | else { | |
177 | SvCUR_set(str,0); | |
178 | } | |
179 | *SvEND(str) = '\0'; | |
180 | } | |
181 | XSRETURN(1); | |
182 | } | |
183 | ||
184 | void | |
6d1c0808 JH |
185 | encode_xs(obj, utf8, check = 0) |
186 | SV * obj | |
85982a32 | 187 | SV * utf8 |
6d1c0808 | 188 | IV check |
85982a32 JH |
189 | CODE: |
190 | { | |
191 | int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0)); | |
192 | U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0))); | |
193 | int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0)); | |
194 | SV *result = newSVpvn("",0); | |
195 | STRLEN ulen; | |
196 | U8 *s = (U8 *)SvPVutf8(utf8,ulen); | |
197 | U8 *e = (U8 *)SvEND(utf8); | |
198 | ST(0) = sv_2mortal(result); | |
199 | if (!endian) { | |
200 | endian = (size == 4) ? 'N' : 'n'; | |
201 | enc_pack(aTHX_ result,size,endian,BOM_BE); | |
202 | #if 0 | |
203 | /* Update endian for this sequence */ | |
204 | hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0); | |
205 | #endif | |
206 | } | |
207 | while (s < e && s+UTF8SKIP(s) <= e) { | |
208 | STRLEN len; | |
209 | UV ord = utf8n_to_uvuni(s, e-s, &len, 0); | |
210 | s += len; | |
211 | if (size != 4 && invalid_ucs2(ord)) { | |
212 | if (!issurrogate(ord)){ | |
213 | if (ucs2) { | |
6d1c0808 | 214 | if (check) { |
0f7c507f JH |
215 | croak("%"SVf":code point \"\\x{%"UVxf"}\" too high", |
216 | *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord); | |
85982a32 JH |
217 | } |
218 | enc_pack(aTHX_ result,size,endian,FBCHAR); | |
219 | }else{ | |
220 | UV hi = ((ord - 0x10000) >> 10) + 0xD800; | |
221 | UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00; | |
222 | enc_pack(aTHX_ result,size,endian,hi); | |
223 | enc_pack(aTHX_ result,size,endian,lo); | |
224 | } | |
225 | } | |
226 | else { | |
227 | /* not supposed to happen */ | |
228 | enc_pack(aTHX_ result,size,endian,FBCHAR); | |
229 | } | |
230 | } | |
231 | else { | |
232 | enc_pack(aTHX_ result,size,endian,ord); | |
233 | } | |
234 | } | |
6d1c0808 | 235 | if (s < e) { |
0f7c507f JH |
236 | Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character", |
237 | *hv_fetch((HV *)SvRV(obj),"Name",4,0)); | |
6d1c0808 JH |
238 | } |
239 | if (check && !(check & ENCODE_LEAVE_SRC)){ | |
85982a32 | 240 | if (s < e) { |
85982a32 JH |
241 | Move(s,SvPVX(utf8),e-s,U8); |
242 | SvCUR_set(utf8,(e-s)); | |
243 | } | |
244 | else { | |
245 | SvCUR_set(utf8,0); | |
246 | } | |
247 | *SvEND(utf8) = '\0'; | |
248 | } | |
249 | XSRETURN(1); | |
250 | } | |
251 |