This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[Encode] 1.88 Released
[perl5.git] / ext / Encode / Unicode / Unicode.xs
CommitLineData
85982a32 1/*
cc7dbc11 2 $Id: Unicode.xs,v 1.7 2003/02/20 14:42:34 dankogai Exp dankogai $
85982a32
JH
3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
6d1c0808
JH
9#define U8 U8
10#include "../Encode/encode.h"
85982a32
JH
11
12#define FBCHAR 0xFFFd
13#define BOM_BE 0xFeFF
14#define BOM16LE 0xFFFe
15#define BOM32LE 0xFFFe0000
16#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20
21static UV
22enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
23{
24 U8 *s = *sp;
25 UV v = 0;
26 if (s+size > e) {
27 croak("Partial character %c",(char) endian);
28 }
29 switch(endian) {
30 case 'N':
31 v = *s++;
32 v = (v << 8) | *s++;
33 case 'n':
34 v = (v << 8) | *s++;
35 v = (v << 8) | *s++;
36 break;
37 case 'V':
38 case 'v':
39 v |= *s++;
40 v |= (*s++ << 8);
41 if (endian == 'v')
42 break;
43 v |= (*s++ << 16);
44 v |= (*s++ << 24);
45 break;
46 default:
47 croak("Unknown endian %c",(char) endian);
48 break;
49 }
50 *sp = s;
51 return v;
52}
53
54void
55enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
56{
57 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
58 switch(endian) {
59 case 'v':
60 case 'V':
61 d += SvCUR(result);
62 SvCUR_set(result,SvCUR(result)+size);
63 while (size--) {
7c436af3 64 *d++ = (U8)(value & 0xFF);
85982a32
JH
65 value >>= 8;
66 }
67 break;
68 case 'n':
69 case 'N':
70 SvCUR_set(result,SvCUR(result)+size);
71 d += SvCUR(result);
72 while (size--) {
7c436af3 73 *--d = (U8)(value & 0xFF);
85982a32
JH
74 value >>= 8;
75 }
76 break;
77 default:
78 croak("Unknown endian %c",(char) endian);
79 break;
80 }
81}
82
83MODULE = Encode::Unicode PACKAGE = Encode::Unicode
84
6d1c0808
JH
85PROTOTYPES: DISABLE
86
85982a32 87void
6d1c0808 88decode_xs(obj, str, check = 0)
85982a32
JH
89SV * obj
90SV * str
6d1c0808 91IV check
85982a32
JH
92CODE:
93{
94 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
95 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
96 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
97 SV *result = newSVpvn("",0);
98 STRLEN ulen;
99 U8 *s = (U8 *)SvPVbyte(str,ulen);
100 U8 *e = (U8 *)SvEND(str);
101 ST(0) = sv_2mortal(result);
102 SvUTF8_on(result);
103
104 if (!endian && s+size <= e) {
105 UV bom;
106 endian = (size == 4) ? 'N' : 'n';
107 bom = enc_unpack(aTHX_ &s,e,size,endian);
108 if (bom != BOM_BE) {
109 if (bom == BOM16LE) {
110 endian = 'v';
111 }
112 else if (bom == BOM32LE) {
113 endian = 'V';
114 }
115 else {
315b3302 116 croak("%"SVf":Unrecognised BOM %"UVxf,
0f7c507f 117 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32
JH
118 bom);
119 }
120 }
121#if 0
122 /* Update endian for this sequence */
123 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
124#endif
125 }
126 while (s < e && s+size <= e) {
127 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
128 U8 *d;
129 if (size != 4 && invalid_ucs2(ord)) {
130 if (ucs2) {
6d1c0808 131 if (check) {
0f7c507f
JH
132 croak("%"SVf":no surrogates allowed %"UVxf,
133 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32
JH
134 ord);
135 }
136 if (s+size <= e) {
137 /* skip the next one as well */
6d1c0808 138 enc_unpack(aTHX_ &s,e,size,endian);
85982a32
JH
139 }
140 ord = FBCHAR;
141 }
142 else {
143 UV lo;
144 if (!isHiSurrogate(ord)) {
0f7c507f
JH
145 croak("%"SVf":Malformed HI surrogate %"UVxf,
146 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32
JH
147 ord);
148 }
149 if (s+size > e) {
150 /* Partial character */
151 s -= size; /* back up to 1st half */
152 break; /* And exit loop */
153 }
154 lo = enc_unpack(aTHX_ &s,e,size,endian);
155 if (!isLoSurrogate(lo)){
0f7c507f
JH
156 croak("%"SVf":Malformed LO surrogate %"UVxf,
157 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
85982a32
JH
158 ord);
159 }
160 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
161 }
162 }
163 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
164 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
165 SvCUR_set(result,d - (U8 *)SvPVX(result));
166 }
6d1c0808 167 if (s < e) {
0f7c507f
JH
168 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
169 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
6d1c0808
JH
170 }
171 if (check && !(check & ENCODE_LEAVE_SRC)){
172 if (s < e) {
85982a32
JH
173 Move(s,SvPVX(str),e-s,U8);
174 SvCUR_set(str,(e-s));
175 }
176 else {
177 SvCUR_set(str,0);
178 }
179 *SvEND(str) = '\0';
180 }
181 XSRETURN(1);
182}
183
184void
6d1c0808
JH
185encode_xs(obj, utf8, check = 0)
186SV * obj
85982a32 187SV * utf8
6d1c0808 188IV check
85982a32
JH
189CODE:
190{
191 int size = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
192 U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
193 int ucs2 = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
194 SV *result = newSVpvn("",0);
195 STRLEN ulen;
196 U8 *s = (U8 *)SvPVutf8(utf8,ulen);
197 U8 *e = (U8 *)SvEND(utf8);
198 ST(0) = sv_2mortal(result);
199 if (!endian) {
200 endian = (size == 4) ? 'N' : 'n';
201 enc_pack(aTHX_ result,size,endian,BOM_BE);
202#if 0
203 /* Update endian for this sequence */
204 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
205#endif
206 }
207 while (s < e && s+UTF8SKIP(s) <= e) {
208 STRLEN len;
209 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
210 s += len;
211 if (size != 4 && invalid_ucs2(ord)) {
212 if (!issurrogate(ord)){
213 if (ucs2) {
6d1c0808 214 if (check) {
0f7c507f
JH
215 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
216 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
85982a32
JH
217 }
218 enc_pack(aTHX_ result,size,endian,FBCHAR);
219 }else{
220 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
221 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
222 enc_pack(aTHX_ result,size,endian,hi);
223 enc_pack(aTHX_ result,size,endian,lo);
224 }
225 }
226 else {
227 /* not supposed to happen */
228 enc_pack(aTHX_ result,size,endian,FBCHAR);
229 }
230 }
231 else {
232 enc_pack(aTHX_ result,size,endian,ord);
233 }
234 }
6d1c0808 235 if (s < e) {
0f7c507f
JH
236 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
237 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
6d1c0808
JH
238 }
239 if (check && !(check & ENCODE_LEAVE_SRC)){
85982a32 240 if (s < e) {
85982a32
JH
241 Move(s,SvPVX(utf8),e-s,U8);
242 SvCUR_set(utf8,(e-s));
243 }
244 else {
245 SvCUR_set(utf8,0);
246 }
247 *SvEND(utf8) = '\0';
248 }
249 XSRETURN(1);
250}
251