2 $Id: Unicode.xs,v 2.2 2006/04/06 15:44:11 dankogai Exp dankogai $
5 #define PERL_NO_GET_CONTEXT
10 #include "../Encode/encode.h"
14 #define BOM16LE 0xFFFe
15 #define BOM32LE 0xFFFe0000
16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
22 enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
27 croak("Partial character %c",(char) endian);
47 croak("Unknown endian %c",(char) endian);
55 enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
57 U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size+1);
62 SvCUR_set(result,SvCUR(result)+size);
64 *d++ = (U8)(value & 0xFF);
70 SvCUR_set(result,SvCUR(result)+size);
73 *--d = (U8)(value & 0xFF);
78 croak("Unknown endian %c",(char) endian);
83 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
87 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
88 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
91 decode_xs(obj, str, check = 0)
97 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
98 int size = SvIV(attr("size", 4));
99 int ucs2 = SvTRUE(attr("ucs2", 4));
100 int renewed = SvTRUE(attr("renewed", 7));
101 SV *result = newSVpvn("",0);
103 U8 *s = (U8 *)SvPVbyte(str,ulen);
104 U8 *e = (U8 *)SvEND(str);
105 ST(0) = sv_2mortal(result);
108 if (!endian && s+size <= e) {
110 endian = (size == 4) ? 'N' : 'n';
111 bom = enc_unpack(aTHX_ &s,e,size,endian);
113 if (bom == BOM16LE) {
116 else if (bom == BOM32LE) {
120 croak("%"SVf":Unrecognised BOM %"UVxf,
121 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
126 /* Update endian for next sequence */
128 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
132 while (s < e && s+size <= e) {
133 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
135 if (issurrogate(ord)) {
136 if (ucs2 || size == 4) {
138 croak("%"SVf":no surrogates allowed %"UVxf,
139 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
143 /* skip the next one as well */
144 enc_unpack(aTHX_ &s,e,size,endian);
150 if (!isHiSurrogate(ord)) {
152 croak("%"SVf":Malformed HI surrogate %"UVxf,
153 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
162 /* Partial character */
163 s -= size; /* back up to 1st half */
164 break; /* And exit loop */
166 lo = enc_unpack(aTHX_ &s,e,size,endian);
167 if (!isLoSurrogate(lo)){
169 croak("%"SVf":Malformed LO surrogate %"UVxf,
170 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
178 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
184 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
186 croak("%"SVf":Unicode character %"UVxf" is illegal",
187 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
194 d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
195 d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
196 SvCUR_set(result,d - (U8 *)SvPVX(result));
199 /* unlikely to happen because it's fixed-length -- dankogai */
200 if (check & ENCODE_WARN_ON_ERR){
201 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
202 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
205 if (check && !(check & ENCODE_LEAVE_SRC)){
207 Move(s,SvPVX(str),e-s,U8);
208 SvCUR_set(str,(e-s));
219 encode_xs(obj, utf8, check = 0)
225 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
226 int size = SvIV(attr("size", 4));
227 int ucs2 = SvTRUE(attr("ucs2", 4));
228 int renewed = SvTRUE(attr("renewed", 7));
229 SV *result = newSVpvn("",0);
231 U8 *s = (U8 *)SvPVutf8(utf8,ulen);
232 U8 *e = (U8 *)SvEND(utf8);
233 ST(0) = sv_2mortal(result);
235 endian = (size == 4) ? 'N' : 'n';
236 enc_pack(aTHX_ result,size,endian,BOM_BE);
238 /* Update endian for next sequence */
240 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
244 while (s < e && s+UTF8SKIP(s) <= e) {
246 UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
248 if (size != 4 && invalid_ucs2(ord)) {
249 if (!issurrogate(ord)){
252 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
253 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
255 enc_pack(aTHX_ result,size,endian,FBCHAR);
257 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
258 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
259 enc_pack(aTHX_ result,size,endian,hi);
260 enc_pack(aTHX_ result,size,endian,lo);
264 /* not supposed to happen */
265 enc_pack(aTHX_ result,size,endian,FBCHAR);
269 enc_pack(aTHX_ result,size,endian,ord);
273 /* UTF-8 partial char happens often on PerlIO.
274 Since this is okay and normal, we do not warn.
275 But this is critical when you choose to LEAVE_SRC
276 in which case we die */
277 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)){
278 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
279 "when CHECK = 0x%" UVuf,
280 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
284 if (check && !(check & ENCODE_LEAVE_SRC)){
286 Move(s,SvPVX(utf8),e-s,U8);
287 SvCUR_set(utf8,(e-s));