This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Updated Encode to CPAN version 2.43
[perl5.git] / cpan / Encode / Unicode / Unicode.xs
CommitLineData
85982a32 1/*
b9370cdb 2 $Id: Unicode.xs,v 2.7 2010/12/31 22:48:48 dankogai Exp $
85982a32
JH
3 */
4
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
6d1c0808
JH
9#define U8 U8
10#include "../Encode/encode.h"
85982a32
JH
11
12#define FBCHAR 0xFFFd
13#define BOM_BE 0xFeFF
14#define BOM16LE 0xFFFe
15#define BOM32LE 0xFFFe0000
16#define issurrogate(x) (0xD800 <= (x) && (x) <= 0xDFFF )
17#define isHiSurrogate(x) (0xD800 <= (x) && (x) < 0xDC00 )
18#define isLoSurrogate(x) (0xDC00 <= (x) && (x) <= 0xDFFF )
19#define invalid_ucs2(x) ( issurrogate(x) || 0xFFFF < (x) )
20
b85802c5
CBW
21/* For pre-5.14 source compatibility */
22#ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
23# define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
24# define UTF8_DISALLOW_SURROGATE 0
25# define UTF8_WARN_SURROGATE 0
26# define UTF8_DISALLOW_FE_FF 0
27# define UTF8_WARN_FE_FF 0
28# define UTF8_WARN_NONCHAR 0
29#endif
30
64bc6d54
SH
31#define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
32
33/* Avoid wasting too much space in the result buffer */
a37eaad4
SH
34/* static void */
35/* shrink_buffer(SV *result) */
36/* { */
37/* if (SvLEN(result) > 42 + SvCUR(result)) { */
38/* char *buf; */
39/* STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
40/* New(0, buf, len, char); */
41/* Copy(SvPVX(result), buf, len, char); */
42/* Safefree(SvPVX(result)); */
43/* SvPV_set(result, buf); */
44/* SvLEN_set(result, len); */
45/* } */
46/* } */
47
48#define shrink_buffer(result) { \
49 if (SvLEN(result) > 42 + SvCUR(result)) { \
50 char *newpv; \
51 STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
52 New(0, newpv, newlen, char); \
53 Copy(SvPVX(result), newpv, newlen, char); \
54 Safefree(SvPVX(result)); \
55 SvPV_set(result, newpv); \
56 SvLEN_set(result, newlen); \
57 } \
64bc6d54
SH
58}
59
85982a32 60static UV
64bc6d54 61enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
85982a32
JH
62{
63 U8 *s = *sp;
64 UV v = 0;
65 if (s+size > e) {
64bc6d54 66 croak("Partial character %c",(char) endian);
85982a32
JH
67 }
68 switch(endian) {
69 case 'N':
64bc6d54
SH
70 v = *s++;
71 v = (v << 8) | *s++;
85982a32 72 case 'n':
64bc6d54
SH
73 v = (v << 8) | *s++;
74 v = (v << 8) | *s++;
75 break;
85982a32
JH
76 case 'V':
77 case 'v':
64bc6d54
SH
78 v |= *s++;
79 v |= (*s++ << 8);
80 if (endian == 'v')
81 break;
82 v |= (*s++ << 16);
83 v |= (*s++ << 24);
84 break;
85982a32 85 default:
64bc6d54
SH
86 croak("Unknown endian %c",(char) endian);
87 break;
85982a32
JH
88 }
89 *sp = s;
90 return v;
91}
92
93void
64bc6d54 94enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
85982a32 95{
64bc6d54
SH
96 U8 *d = (U8 *) SvPV_nolen(result);
97
85982a32
JH
98 switch(endian) {
99 case 'v':
100 case 'V':
64bc6d54
SH
101 d += SvCUR(result);
102 SvCUR_set(result,SvCUR(result)+size);
103 while (size--) {
104 *d++ = (U8)(value & 0xFF);
105 value >>= 8;
106 }
107 break;
85982a32
JH
108 case 'n':
109 case 'N':
64bc6d54
SH
110 SvCUR_set(result,SvCUR(result)+size);
111 d += SvCUR(result);
112 while (size--) {
113 *--d = (U8)(value & 0xFF);
114 value >>= 8;
115 }
116 break;
85982a32 117 default:
64bc6d54
SH
118 croak("Unknown endian %c",(char) endian);
119 break;
85982a32
JH
120 }
121}
122
123MODULE = Encode::Unicode PACKAGE = Encode::Unicode
124
6d1c0808
JH
125PROTOTYPES: DISABLE
126
a0d8a30e
DK
127#define attr(k, l) (hv_exists((HV *)SvRV(obj),k,l) ? \
128 *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
129
85982a32 130void
6d1c0808 131decode_xs(obj, str, check = 0)
85982a32
JH
132SV * obj
133SV * str
6d1c0808 134IV check
85982a32
JH
135CODE:
136{
64bc6d54
SH
137 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
138 int size = SvIV(attr("size", 4));
139 int ucs2 = -1; /* only needed in the event of surrogate pairs */
140 SV *result = newSVpvn("",0);
141 STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
85982a32 142 STRLEN ulen;
64bc6d54
SH
143 STRLEN resultbuflen;
144 U8 *resultbuf;
85982a32
JH
145 U8 *s = (U8 *)SvPVbyte(str,ulen);
146 U8 *e = (U8 *)SvEND(str);
64bc6d54
SH
147 /* Optimise for the common case of being called from PerlIOEncode_fill()
148 with a standard length buffer. In this case the result SV's buffer is
149 only used temporarily, so we can afford to allocate the maximum needed
150 and not care about unused space. */
151 const bool temp_result = (ulen == PERLIO_BUFSIZ);
152
85982a32
JH
153 ST(0) = sv_2mortal(result);
154 SvUTF8_on(result);
155
156 if (!endian && s+size <= e) {
64bc6d54
SH
157 UV bom;
158 endian = (size == 4) ? 'N' : 'n';
159 bom = enc_unpack(aTHX_ &s,e,size,endian);
160 if (bom != BOM_BE) {
161 if (bom == BOM16LE) {
162 endian = 'v';
163 }
164 else if (bom == BOM32LE) {
165 endian = 'V';
166 }
167 else {
168 croak("%"SVf":Unrecognised BOM %"UVxf,
169 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
170 bom);
171 }
172 }
a0d8a30e 173#if 1
64bc6d54
SH
174 /* Update endian for next sequence */
175 if (SvTRUE(attr("renewed", 7))) {
176 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
177 }
85982a32
JH
178#endif
179 }
0a8c69ed 180
64bc6d54
SH
181 if (temp_result) {
182 resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
183 } else {
184 /* Preallocate the buffer to the minimum possible space required. */
185 resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
d1256cb1 186 }
64bc6d54 187 resultbuf = (U8 *) SvGROW(result, resultbuflen);
0a8c69ed 188
64bc6d54
SH
189 while (s < e && s+size <= e) {
190 UV ord = enc_unpack(aTHX_ &s,e,size,endian);
191 U8 *d;
192 if (issurrogate(ord)) {
193 if (ucs2 == -1) {
194 ucs2 = SvTRUE(attr("ucs2", 4));
195 }
196 if (ucs2 || size == 4) {
197 if (check) {
198 croak("%"SVf":no surrogates allowed %"UVxf,
199 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
200 ord);
201 }
202 if (s+size <= e) {
203 /* skip the next one as well */
204 enc_unpack(aTHX_ &s,e,size,endian);
205 }
206 ord = FBCHAR;
207 }
208 else {
209 UV lo;
210 if (!isHiSurrogate(ord)) {
211 if (check) {
212 croak("%"SVf":Malformed HI surrogate %"UVxf,
213 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
214 ord);
215 }
216 else {
217 ord = FBCHAR;
218 }
219 }
220 else {
221 if (s+size > e) {
222 /* Partial character */
223 s -= size; /* back up to 1st half */
224 break; /* And exit loop */
225 }
226 lo = enc_unpack(aTHX_ &s,e,size,endian);
227 if (!isLoSurrogate(lo)) {
228 if (check) {
229 croak("%"SVf":Malformed LO surrogate %"UVxf,
230 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
231 ord);
232 }
233 else {
234 ord = FBCHAR;
235 }
236 }
237 else {
238 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
239 }
240 }
241 }
242 }
243
244 if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
245 if (check) {
246 croak("%"SVf":Unicode character %"UVxf" is illegal",
247 *hv_fetch((HV *)SvRV(obj),"Name",4,0),
248 ord);
249 } else {
250 ord = FBCHAR;
251 }
252 }
253
254 if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
255 /* Do not allocate >8Mb more than the minimum needed.
256 This prevents allocating too much in the rogue case of a large
257 input consisting initially of long sequence uft8-byte unicode
258 chars followed by single utf8-byte chars. */
259 STRLEN remaining = (e - s)/usize;
260 STRLEN max_alloc = remaining + (8*1024*1024);
261 STRLEN est_alloc = remaining * UTF8_MAXLEN;
262 STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
263 (est_alloc > max_alloc ? max_alloc : est_alloc);
264 resultbuf = (U8 *) SvGROW(result, newlen);
265 resultbuflen = SvLEN(result);
266 }
267
b85802c5
CBW
268 d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
269 UNICODE_WARN_ILLEGAL_INTERCHANGE);
64bc6d54 270 SvCUR_set(result, d - (U8 *)SvPVX(result));
6d1c0808 271 }
64bc6d54 272
d1256cb1 273 if (s < e) {
64bc6d54
SH
274 /* unlikely to happen because it's fixed-length -- dankogai */
275 if (check & ENCODE_WARN_ON_ERR) {
276 Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
277 *hv_fetch((HV *)SvRV(obj),"Name",4,0));
278 }
d1256cb1 279 }
64bc6d54
SH
280 if (check && !(check & ENCODE_LEAVE_SRC)) {
281 if (s < e) {
282 Move(s,SvPVX(str),e-s,U8);
283 SvCUR_set(str,(e-s));
284 }
285 else {
286 SvCUR_set(str,0);
287 }
288 *SvEND(str) = '\0';
85982a32 289 }
64bc6d54
SH
290
291 if (!temp_result)
292 shrink_buffer(result);
293
85982a32
JH
294 XSRETURN(1);
295}
296
297void
6d1c0808
JH
298encode_xs(obj, utf8, check = 0)
299SV * obj
85982a32 300SV * utf8
6d1c0808 301IV check
85982a32
JH
302CODE:
303{
64bc6d54
SH
304 U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
305 const int size = SvIV(attr("size", 4));
306 int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
307 const STRLEN usize = (size > 0 ? size : 1);
308 SV *result = newSVpvn("", 0);
85982a32 309 STRLEN ulen;
64bc6d54
SH
310 U8 *s = (U8 *) SvPVutf8(utf8, ulen);
311 const U8 *e = (U8 *) SvEND(utf8);
312 /* Optimise for the common case of being called from PerlIOEncode_flush()
313 with a standard length buffer. In this case the result SV's buffer is
314 only used temporarily, so we can afford to allocate the maximum needed
315 and not care about unused space. */
316 const bool temp_result = (ulen == PERLIO_BUFSIZ);
317
85982a32 318 ST(0) = sv_2mortal(result);
64bc6d54
SH
319
320 /* Preallocate the result buffer to the maximum possible size.
321 ie. assume each UTF8 byte is 1 character.
322 Then shrink the result's buffer if necesary at the end. */
323 SvGROW(result, ((ulen+1) * usize));
324
85982a32 325 if (!endian) {
64bc6d54
SH
326 endian = (size == 4) ? 'N' : 'n';
327 enc_pack(aTHX_ result,size,endian,BOM_BE);
a0d8a30e 328#if 1
64bc6d54
SH
329 /* Update endian for next sequence */
330 if (SvTRUE(attr("renewed", 7))) {
331 hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
332 }
85982a32
JH
333#endif
334 }
335 while (s < e && s+UTF8SKIP(s) <= e) {
64bc6d54 336 STRLEN len;
b85802c5
CBW
337 UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
338 |UTF8_WARN_SURROGATE
339 |UTF8_DISALLOW_FE_FF
340 |UTF8_WARN_FE_FF
341 |UTF8_WARN_NONCHAR));
64bc6d54
SH
342 s += len;
343 if (size != 4 && invalid_ucs2(ord)) {
344 if (!issurrogate(ord)) {
345 if (ucs2 == -1) {
346 ucs2 = SvTRUE(attr("ucs2", 4));
347 }
348 if (ucs2) {
349 if (check) {
350 croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
351 *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
352 }
353 enc_pack(aTHX_ result,size,endian,FBCHAR);
354 } else {
355 UV hi = ((ord - 0x10000) >> 10) + 0xD800;
356 UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
357 enc_pack(aTHX_ result,size,endian,hi);
358 enc_pack(aTHX_ result,size,endian,lo);
359 }
360 }
361 else {
362 /* not supposed to happen */
363 enc_pack(aTHX_ result,size,endian,FBCHAR);
364 }
365 }
366 else {
367 enc_pack(aTHX_ result,size,endian,ord);
368 }
85982a32 369 }
6d1c0808 370 if (s < e) {
64bc6d54
SH
371 /* UTF-8 partial char happens often on PerlIO.
372 Since this is okay and normal, we do not warn.
373 But this is critical when you choose to LEAVE_SRC
374 in which case we die */
375 if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
376 Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
377 "when CHECK = 0x%" UVuf,
378 *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
379 }
d1256cb1 380 }
64bc6d54
SH
381 if (check && !(check & ENCODE_LEAVE_SRC)) {
382 if (s < e) {
383 Move(s,SvPVX(utf8),e-s,U8);
384 SvCUR_set(utf8,(e-s));
385 }
386 else {
387 SvCUR_set(utf8,0);
388 }
389 *SvEND(utf8) = '\0';
d1256cb1 390 }
64bc6d54
SH
391
392 if (!temp_result)
393 shrink_buffer(result);
394
b2deda17
JV
395 SvSETMAGIC(utf8);
396
85982a32
JH
397 XSRETURN(1);
398}