2 $Id: Unicode.xs,v 2.11 2014/04/29 16:25:06 dankogai Exp dankogai $
5 #define PERL_NO_GET_CONTEXT
9 #include "../Encode/encode.h"
13 #define BOM16LE 0xFFFe
14 #define BOM32LE 0xFFFe0000
15 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
16 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
17 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
18 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20 /* For pre-5.14 source compatibility */
21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
22 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
23 # define UTF8_DISALLOW_SURROGATE 0
24 # define UTF8_WARN_SURROGATE 0
25 # define UTF8_DISALLOW_FE_FF 0
26 # define UTF8_WARN_FE_FF 0
27 # define UTF8_WARN_NONCHAR 0
30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
32 /* Avoid wasting too much space in the result buffer */
34 /* shrink_buffer(SV *result) */
36 /* if (SvLEN(result) > 42 + SvCUR(result)) { */
38 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
39 /* New(0, buf, len, char); */
40 /* Copy(SvPVX(result), buf, len, char); */
41 /* Safefree(SvPVX(result)); */
42 /* SvPV_set(result, buf); */
43 /* SvLEN_set(result, len); */
47 #define shrink_buffer(result) { \
48 if (SvLEN(result) > 42 + SvCUR(result)) { \
50 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
51 New(0, newpv, newlen, char); \
52 Copy(SvPVX(result), newpv, newlen, char); \
53 Safefree(SvPVX(result)); \
54 SvPV_set(result, newpv); \
55 SvLEN_set(result, newlen); \
60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
65 croak("Partial character %c",(char) endian);
82 v |= ((UV)*s++ << 24);
85 croak("Unknown endian %c",(char) endian);
93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
95 U8 *d = (U8 *) SvPV_nolen(result);
101 SvCUR_set(result,SvCUR(result)+size);
103 *d++ = (U8)(value & 0xFF);
109 SvCUR_set(result,SvCUR(result)+size);
112 *--d = (U8)(value & 0xFF);
117 croak("Unknown endian %c",(char) endian);
122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
126 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
127 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
130 decode_xs(obj, str, check = 0)
136 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
137 int size = SvIV(attr("size", 4));
138 int ucs2 = -1; /* only needed in the event of surrogate pairs */
139 SV *result = newSVpvn("",0);
140 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
144 U8 *s = (U8 *)SvPVbyte(str,ulen);
145 U8 *e = (U8 *)SvEND(str);
146 /* Optimise for the common case of being called from PerlIOEncode_fill()
147 with a standard length buffer. In this case the result SV's buffer is
148 only used temporarily, so we can afford to allocate the maximum needed
149 and not care about unused space. */
150 const bool temp_result = (ulen == PERLIO_BUFSIZ);
152 ST(0) = sv_2mortal(result);
155 if (!endian && s+size <= e) {
157 endian = (size == 4) ? 'N' : 'n';
158 bom = enc_unpack(aTHX_ &s,e,size,endian);
160 if (bom == BOM16LE) {
163 else if (bom == BOM32LE) {
167 croak("%"SVf":Unrecognised BOM %"UVxf,
168 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
173 /* Update endian for next sequence */
174 if (SvTRUE(attr("renewed", 7))) {
175 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
181 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
183 /* Preallocate the buffer to the minimum possible space required. */
184 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
186 resultbuf = (U8 *) SvGROW(result, resultbuflen);
188 while (s < e && s+size <= e) {
189 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
191 if (issurrogate(ord)) {
193 ucs2 = SvTRUE(attr("ucs2", 4));
195 if (ucs2 || size == 4) {
197 croak("%"SVf":no surrogates allowed %"UVxf,
198 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
205 if (!isHiSurrogate(ord)) {
207 croak("%"SVf":Malformed HI surrogate %"UVxf,
208 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
215 else if (s+size > e) {
217 if (check & ENCODE_STOP_AT_PARTIAL) {
222 croak("%"SVf":Malformed HI surrogate %"UVxf,
223 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
232 lo = enc_unpack(aTHX_ &s,e,size,endian);
233 if (!isLoSurrogate(lo)) {
235 croak("%"SVf":Malformed LO surrogate %"UVxf,
236 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
245 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
251 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
253 croak("%"SVf":Unicode character %"UVxf" is illegal",
254 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
261 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
262 /* Do not allocate >8Mb more than the minimum needed.
263 This prevents allocating too much in the rogue case of a large
264 input consisting initially of long sequence uft8-byte unicode
265 chars followed by single utf8-byte chars. */
267 fixes Unicode.xs!decode_xs n-byte heap-overflow
269 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
270 STRLEN max_alloc = remaining + (8*1024*1024);
271 STRLEN est_alloc = remaining * UTF8_MAXLEN;
272 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
273 (est_alloc > max_alloc ? max_alloc : est_alloc);
274 resultbuf = (U8 *) SvGROW(result, newlen);
275 resultbuflen = SvLEN(result);
278 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
279 UNICODE_WARN_ILLEGAL_INTERCHANGE);
280 SvCUR_set(result, d - (U8 *)SvPVX(result));
284 /* unlikely to happen because it's fixed-length -- dankogai */
285 if (check & ENCODE_WARN_ON_ERR) {
286 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
287 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
290 if (check && !(check & ENCODE_LEAVE_SRC)) {
292 Move(s,SvPVX(str),e-s,U8);
293 SvCUR_set(str,(e-s));
301 if (!temp_result) shrink_buffer(result);
302 if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
307 encode_xs(obj, utf8, check = 0)
313 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
314 const int size = SvIV(attr("size", 4));
315 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
316 const STRLEN usize = (size > 0 ? size : 1);
317 SV *result = newSVpvn("", 0);
319 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
320 const U8 *e = (U8 *) SvEND(utf8);
321 /* Optimise for the common case of being called from PerlIOEncode_flush()
322 with a standard length buffer. In this case the result SV's buffer is
323 only used temporarily, so we can afford to allocate the maximum needed
324 and not care about unused space. */
325 const bool temp_result = (ulen == PERLIO_BUFSIZ);
327 ST(0) = sv_2mortal(result);
329 /* Preallocate the result buffer to the maximum possible size.
330 ie. assume each UTF8 byte is 1 character.
331 Then shrink the result's buffer if necesary at the end. */
332 SvGROW(result, ((ulen+1) * usize));
335 endian = (size == 4) ? 'N' : 'n';
336 enc_pack(aTHX_ result,size,endian,BOM_BE);
338 /* Update endian for next sequence */
339 if (SvTRUE(attr("renewed", 7))) {
340 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
344 while (s < e && s+UTF8SKIP(s) <= e) {
346 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
350 |UTF8_WARN_NONCHAR));
352 if (size != 4 && invalid_ucs2(ord)) {
353 if (!issurrogate(ord)) {
355 ucs2 = SvTRUE(attr("ucs2", 4));
357 if (ucs2 || ord > 0x10FFFF) {
359 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
360 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
362 enc_pack(aTHX_ result,size,endian,FBCHAR);
364 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
365 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
366 enc_pack(aTHX_ result,size,endian,hi);
367 enc_pack(aTHX_ result,size,endian,lo);
371 /* not supposed to happen */
372 enc_pack(aTHX_ result,size,endian,FBCHAR);
376 enc_pack(aTHX_ result,size,endian,ord);
380 /* UTF-8 partial char happens often on PerlIO.
381 Since this is okay and normal, we do not warn.
382 But this is critical when you choose to LEAVE_SRC
383 in which case we die */
384 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
385 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
386 "when CHECK = 0x%" UVuf,
387 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
390 if (check && !(check & ENCODE_LEAVE_SRC)) {
392 Move(s,SvPVX(utf8),e-s,U8);
393 SvCUR_set(utf8,(e-s));
401 if (!temp_result) shrink_buffer(result);
402 if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */