2 $Id: Unicode.xs,v 2.9 2012/08/05 23:08:49 dankogai Exp $
5 #define PERL_NO_GET_CONTEXT
10 #include "../Encode/encode.h"
14 #define BOM16LE 0xFFFe
15 #define BOM32LE 0xFFFe0000
16 #define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17 #define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18 #define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19 #define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
21 /* For pre-5.14 source compatibility */
22 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
23 # define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
24 # define UTF8_DISALLOW_SURROGATE 0
25 # define UTF8_WARN_SURROGATE 0
26 # define UTF8_DISALLOW_FE_FF 0
27 # define UTF8_WARN_FE_FF 0
28 # define UTF8_WARN_NONCHAR 0
31 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
33 /* Avoid wasting too much space in the result buffer */
35 /* shrink_buffer(SV *result) */
37 /* if (SvLEN(result) > 42 + SvCUR(result)) { */
39 /* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
40 /* New(0, buf, len, char); */
41 /* Copy(SvPVX(result), buf, len, char); */
42 /* Safefree(SvPVX(result)); */
43 /* SvPV_set(result, buf); */
44 /* SvLEN_set(result, len); */
48 #define shrink_buffer(result) { \
49 if (SvLEN(result) > 42 + SvCUR(result)) { \
51 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
52 New(0, newpv, newlen, char); \
53 Copy(SvPVX(result), newpv, newlen, char); \
54 Safefree(SvPVX(result)); \
55 SvPV_set(result, newpv); \
56 SvLEN_set(result, newlen); \
61 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
66 croak("Partial character %c",(char) endian);
86 croak("Unknown endian %c",(char) endian);
94 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
96 U8 *d = (U8 *) SvPV_nolen(result);
102 SvCUR_set(result,SvCUR(result)+size);
104 *d++ = (U8)(value & 0xFF);
110 SvCUR_set(result,SvCUR(result)+size);
113 *--d = (U8)(value & 0xFF);
118 croak("Unknown endian %c",(char) endian);
123 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
127 #define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
128 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
131 decode_xs(obj, str, check = 0)
137 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
138 int size = SvIV(attr("size", 4));
139 int ucs2 = -1; /* only needed in the event of surrogate pairs */
140 SV *result = newSVpvn("",0);
141 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
145 U8 *s = (U8 *)SvPVbyte(str,ulen);
146 U8 *e = (U8 *)SvEND(str);
147 /* Optimise for the common case of being called from PerlIOEncode_fill()
148 with a standard length buffer. In this case the result SV's buffer is
149 only used temporarily, so we can afford to allocate the maximum needed
150 and not care about unused space. */
151 const bool temp_result = (ulen == PERLIO_BUFSIZ);
153 ST(0) = sv_2mortal(result);
156 if (!endian && s+size <= e) {
158 endian = (size == 4) ? 'N' : 'n';
159 bom = enc_unpack(aTHX_ &s,e,size,endian);
161 if (bom == BOM16LE) {
164 else if (bom == BOM32LE) {
168 croak("%"SVf":Unrecognised BOM %"UVxf,
169 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
174 /* Update endian for next sequence */
175 if (SvTRUE(attr("renewed", 7))) {
176 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
182 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
184 /* Preallocate the buffer to the minimum possible space required. */
185 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
187 resultbuf = (U8 *) SvGROW(result, resultbuflen);
189 while (s < e && s+size <= e) {
190 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
192 if (issurrogate(ord)) {
194 ucs2 = SvTRUE(attr("ucs2", 4));
196 if (ucs2 || size == 4) {
198 croak("%"SVf":no surrogates allowed %"UVxf,
199 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
206 if (!isHiSurrogate(ord)) {
208 croak("%"SVf":Malformed HI surrogate %"UVxf,
209 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
216 else if (s+size > e) {
218 if (check & ENCODE_STOP_AT_PARTIAL) {
223 croak("%"SVf":Malformed HI surrogate %"UVxf,
224 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
233 lo = enc_unpack(aTHX_ &s,e,size,endian);
234 if (!isLoSurrogate(lo)) {
236 croak("%"SVf":Malformed LO surrogate %"UVxf,
237 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
246 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
252 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
254 croak("%"SVf":Unicode character %"UVxf" is illegal",
255 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
262 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
263 /* Do not allocate >8Mb more than the minimum needed.
264 This prevents allocating too much in the rogue case of a large
265 input consisting initially of long sequence uft8-byte unicode
266 chars followed by single utf8-byte chars. */
268 fixes Unicode.xs!decode_xs n-byte heap-overflow
270 STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
271 STRLEN max_alloc = remaining + (8*1024*1024);
272 STRLEN est_alloc = remaining * UTF8_MAXLEN;
273 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
274 (est_alloc > max_alloc ? max_alloc : est_alloc);
275 resultbuf = (U8 *) SvGROW(result, newlen);
276 resultbuflen = SvLEN(result);
279 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
280 UNICODE_WARN_ILLEGAL_INTERCHANGE);
281 SvCUR_set(result, d - (U8 *)SvPVX(result));
285 /* unlikely to happen because it's fixed-length -- dankogai */
286 if (check & ENCODE_WARN_ON_ERR) {
287 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
288 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
291 if (check && !(check & ENCODE_LEAVE_SRC)) {
293 Move(s,SvPVX(str),e-s,U8);
294 SvCUR_set(str,(e-s));
303 shrink_buffer(result);
309 encode_xs(obj, utf8, check = 0)
315 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
316 const int size = SvIV(attr("size", 4));
317 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
318 const STRLEN usize = (size > 0 ? size : 1);
319 SV *result = newSVpvn("", 0);
321 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
322 const U8 *e = (U8 *) SvEND(utf8);
323 /* Optimise for the common case of being called from PerlIOEncode_flush()
324 with a standard length buffer. In this case the result SV's buffer is
325 only used temporarily, so we can afford to allocate the maximum needed
326 and not care about unused space. */
327 const bool temp_result = (ulen == PERLIO_BUFSIZ);
329 ST(0) = sv_2mortal(result);
331 /* Preallocate the result buffer to the maximum possible size.
332 ie. assume each UTF8 byte is 1 character.
333 Then shrink the result's buffer if necesary at the end. */
334 SvGROW(result, ((ulen+1) * usize));
337 endian = (size == 4) ? 'N' : 'n';
338 enc_pack(aTHX_ result,size,endian,BOM_BE);
340 /* Update endian for next sequence */
341 if (SvTRUE(attr("renewed", 7))) {
342 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
346 while (s < e && s+UTF8SKIP(s) <= e) {
348 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
352 |UTF8_WARN_NONCHAR));
354 if (size != 4 && invalid_ucs2(ord)) {
355 if (!issurrogate(ord)) {
357 ucs2 = SvTRUE(attr("ucs2", 4));
359 if (ucs2 || ord > 0x10FFFF) {
361 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
362 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
364 enc_pack(aTHX_ result,size,endian,FBCHAR);
366 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
367 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
368 enc_pack(aTHX_ result,size,endian,hi);
369 enc_pack(aTHX_ result,size,endian,lo);
373 /* not supposed to happen */
374 enc_pack(aTHX_ result,size,endian,FBCHAR);
378 enc_pack(aTHX_ result,size,endian,ord);
382 /* UTF-8 partial char happens often on PerlIO.
383 Since this is okay and normal, we do not warn.
384 But this is critical when you choose to LEAVE_SRC
385 in which case we die */
386 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
387 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
388 "when CHECK = 0x%" UVuf,
389 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
392 if (check && !(check & ENCODE_LEAVE_SRC)) {
394 Move(s,SvPVX(utf8),e-s,U8);
395 SvCUR_set(utf8,(e-s));
404 shrink_buffer(result);