2 $Id: Unicode.xs,v 2.10 2013/04/26 18:30:46 dankogai Exp $
5 #define PERL_NO_GET_CONTEXT
10 #include "../Encode/encode.h"
14 #define BOM16LE 0xFFFe
15 #define BOM32LE 0xFFFe0000
16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
21 /* For pre-5.14 source compatibility */
22 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
23 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
24 # define UTF8_DISALLOW_SURROGATE 0
25 # define UTF8_WARN_SURROGATE 0
26 # define UTF8_DISALLOW_FE_FF 0
27 # define UTF8_WARN_FE_FF 0
28 # define UTF8_WARN_NONCHAR 0
31 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
33 /* Avoid wasting too much space in the result buffer */
35 /* shrink_buffer(SV *result) */
37 /* if (SvLEN(result) > 42 + SvCUR(result)) { */
39 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
40 /* New(0, buf, len, char); */
41 /* Copy(SvPVX(result), buf, len, char); */
42 /* Safefree(SvPVX(result)); */
43 /* SvPV_set(result, buf); */
44 /* SvLEN_set(result, len); */
48 #define shrink_buffer(result) { \
49 if (SvLEN(result) > 42 + SvCUR(result)) { \
51 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
52 New(0, newpv, newlen, char); \
53 Copy(SvPVX(result), newpv, newlen, char); \
54 Safefree(SvPVX(result)); \
55 SvPV_set(result, newpv); \
56 SvLEN_set(result, newlen); \
61 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
66 croak("Partial character %c",(char) endian);
86 croak("Unknown endian %c",(char) endian);
94 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
96 U8 *d = (U8 *) SvPV_nolen(result);
102 SvCUR_set(result,SvCUR(result)+size);
104 *d++ = (U8)(value & 0xFF);
110 SvCUR_set(result,SvCUR(result)+size);
113 *--d = (U8)(value & 0xFF);
118 croak("Unknown endian %c",(char) endian);
123 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
127 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
128 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
131 decode_xs(obj, str, check = 0)
137 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
138 int size = SvIV(attr("size", 4));
139 int ucs2 = -1; /* only needed in the event of surrogate pairs */
140 SV *result = newSVpvn("",0);
141 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
145 U8 *s = (U8 *)SvPVbyte(str,ulen);
146 U8 *e = (U8 *)SvEND(str);
147 /* Optimise for the common case of being called from PerlIOEncode_fill()
148 with a standard length buffer. In this case the result SV's buffer is
149 only used temporarily, so we can afford to allocate the maximum needed
150 and not care about unused space. */
151 const bool temp_result = (ulen == PERLIO_BUFSIZ);
153 ST(0) = sv_2mortal(result);
156 if (!endian && s+size <= e) {
158 endian = (size == 4) ? 'N' : 'n';
159 bom = enc_unpack(aTHX_ &s,e,size,endian);
161 if (bom == BOM16LE) {
164 else if (bom == BOM32LE) {
168 croak("%"SVf":Unrecognised BOM %"UVxf,
169 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
174 /* Update endian for next sequence */
175 if (SvTRUE(attr("renewed", 7))) {
176 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
182 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
184 /* Preallocate the buffer to the minimum possible space required. */
185 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
187 resultbuf = (U8 *) SvGROW(result, resultbuflen);
189 while (s < e && s+size <= e) {
190 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
192 if (issurrogate(ord)) {
194 ucs2 = SvTRUE(attr("ucs2", 4));
196 if (ucs2 || size == 4) {
198 croak("%"SVf":no surrogates allowed %"UVxf,
199 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
206 if (!isHiSurrogate(ord)) {
208 croak("%"SVf":Malformed HI surrogate %"UVxf,
209 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
216 else if (s+size > e) {
218 if (check & ENCODE_STOP_AT_PARTIAL) {
223 croak("%"SVf":Malformed HI surrogate %"UVxf,
224 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
233 lo = enc_unpack(aTHX_ &s,e,size,endian);
234 if (!isLoSurrogate(lo)) {
236 croak("%"SVf":Malformed LO surrogate %"UVxf,
237 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
246 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
252 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
254 croak("%"SVf":Unicode character %"UVxf" is illegal",
255 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
262 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
263 /* Do not allocate >8Mb more than the minimum needed.
264 This prevents allocating too much in the rogue case of a large
265 input consisting initially of long sequence uft8-byte unicode
266 chars followed by single utf8-byte chars. */
268 fixes Unicode.xs!decode_xs n-byte heap-overflow
270 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
271 STRLEN max_alloc = remaining + (8*1024*1024);
272 STRLEN est_alloc = remaining * UTF8_MAXLEN;
273 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
274 (est_alloc > max_alloc ? max_alloc : est_alloc);
275 resultbuf = (U8 *) SvGROW(result, newlen);
276 resultbuflen = SvLEN(result);
279 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
280 UNICODE_WARN_ILLEGAL_INTERCHANGE);
281 SvCUR_set(result, d - (U8 *)SvPVX(result));
285 /* unlikely to happen because it's fixed-length -- dankogai */
286 if (check & ENCODE_WARN_ON_ERR) {
287 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
288 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
291 if (check && !(check & ENCODE_LEAVE_SRC)) {
293 Move(s,SvPVX(str),e-s,U8);
294 SvCUR_set(str,(e-s));
302 if (!temp_result) shrink_buffer(result);
303 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
308 encode_xs(obj, utf8, check = 0)
314 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
315 const int size = SvIV(attr("size", 4));
316 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
317 const STRLEN usize = (size > 0 ? size : 1);
318 SV *result = newSVpvn("", 0);
320 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
321 const U8 *e = (U8 *) SvEND(utf8);
322 /* Optimise for the common case of being called from PerlIOEncode_flush()
323 with a standard length buffer. In this case the result SV's buffer is
324 only used temporarily, so we can afford to allocate the maximum needed
325 and not care about unused space. */
326 const bool temp_result = (ulen == PERLIO_BUFSIZ);
328 ST(0) = sv_2mortal(result);
330 /* Preallocate the result buffer to the maximum possible size.
331 ie. assume each UTF8 byte is 1 character.
332 Then shrink the result's buffer if necesary at the end. */
333 SvGROW(result, ((ulen+1) * usize));
336 endian = (size == 4) ? 'N' : 'n';
337 enc_pack(aTHX_ result,size,endian,BOM_BE);
339 /* Update endian for next sequence */
340 if (SvTRUE(attr("renewed", 7))) {
341 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
345 while (s < e && s+UTF8SKIP(s) <= e) {
347 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
351 |UTF8_WARN_NONCHAR));
353 if (size != 4 && invalid_ucs2(ord)) {
354 if (!issurrogate(ord)) {
356 ucs2 = SvTRUE(attr("ucs2", 4));
358 if (ucs2 || ord > 0x10FFFF) {
360 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
361 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
363 enc_pack(aTHX_ result,size,endian,FBCHAR);
365 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
366 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
367 enc_pack(aTHX_ result,size,endian,hi);
368 enc_pack(aTHX_ result,size,endian,lo);
372 /* not supposed to happen */
373 enc_pack(aTHX_ result,size,endian,FBCHAR);
377 enc_pack(aTHX_ result,size,endian,ord);
381 /* UTF-8 partial char happens often on PerlIO.
382 Since this is okay and normal, we do not warn.
383 But this is critical when you choose to LEAVE_SRC
384 in which case we die */
385 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
386 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
387 "when CHECK = 0x%" UVuf,
388 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
391 if (check && !(check & ENCODE_LEAVE_SRC)) {
393 Move(s,SvPVX(utf8),e-s,U8);
394 SvCUR_set(utf8,(e-s));
402 if (!temp_result) shrink_buffer(result);
403 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */