This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: useless use of void context work-around
[perl5.git] / ext / Encode / Encode.xs
CommitLineData
2c674647
JH
1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
2f2b4ff2
NIS
4#define U8 U8
5#include "encode.h"
6#include "iso8859.h"
7#include "EBCDIC.h"
8#include "Symbols.h"
2c674647 9
67e989fb 10#define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) { \
2f5768b8 11 Perl_croak(aTHX_ "panic_unimplemented"); \
4a83738a 12 return (y)0; /* fool picky compilers */ \
87714904 13 }
67e989fb
JH
14UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
15UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
16
33af2bc7 17#ifdef USE_PERLIO
72e44f29
NIS
18/* Define an encoding "layer" in the perliol.h sense.
19 The layer defined here "inherits" in an object-oriented sense from the
20 "perlio" layer with its PerlIOBuf_* "methods".
21 The implementation is particularly efficient as until Encode settles down
22 there is no point in tryint to tune it.
23
24 The layer works by overloading the "fill" and "flush" methods.
25
26 "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API
27 to convert the encoded data to UTF-8 form, then copies it back to the
28 buffer. The "base class's" read methods then see the UTF-8 data.
29
30 "flush" transforms the UTF-8 data deposited by the "base class's write
31 method in the buffer back into the encoded form using the encode OO perl API,
32 then copies data back into the buffer and calls "SUPER::flush.
33
34 Note that "flush" is _also_ called for read mode - we still do the (back)-translate
35 so that the the base class's "flush" sees the correct number of encoded chars
36 for positioning the seek pointer. (This double translation is the worst performance
37 issue - particularly with all-perl encode engine.)
38
39*/
40
41
33af2bc7
NIS
42#include "perliol.h"
43
44typedef struct
45{
46 PerlIOBuf base; /* PerlIOBuf stuff */
47 SV * bufsv;
48 SV * enc;
49} PerlIOEncode;
50
51
52IV
53PerlIOEncode_pushed(PerlIO *f, const char *mode,const char *arg,STRLEN len)
54{
55 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
56 dTHX;
57 dSP;
58 IV code;
59 code = PerlIOBuf_pushed(f,mode,Nullch,0);
60 ENTER;
61 SAVETMPS;
62 PUSHMARK(sp);
63 XPUSHs(sv_2mortal(newSVpv("Encode",0)));
64 XPUSHs(sv_2mortal(newSVpvn(arg,len)));
65 PUTBACK;
66 if (perl_call_method("getEncoding",G_SCALAR) != 1)
67 return -1;
68 SPAGAIN;
69 e->enc = POPs;
70 PUTBACK;
71 if (!SvROK(e->enc))
72 return -1;
73 SvREFCNT_inc(e->enc);
74 FREETMPS;
75 LEAVE;
76 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
77 return code;
78}
79
80IV
81PerlIOEncode_popped(PerlIO *f)
82{
83 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
84 dTHX;
85 if (e->enc)
86 {
87 SvREFCNT_dec(e->enc);
88 e->enc = Nullsv;
89 }
90 if (e->bufsv)
91 {
92 SvREFCNT_dec(e->bufsv);
93 e->bufsv = Nullsv;
94 }
95 return 0;
96}
97
98STDCHAR *
99PerlIOEncode_get_base(PerlIO *f)
100{
101 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
102 dTHX;
103 if (!e->base.bufsiz)
104 e->base.bufsiz = 1024;
105 if (!e->bufsv)
106 {
107 e->bufsv = newSV(e->base.bufsiz);
108 sv_setpvn(e->bufsv,"",0);
109 }
62e8870c 110 e->base.buf = (STDCHAR *)SvPVX(e->bufsv);
33af2bc7
NIS
111 if (!e->base.ptr)
112 e->base.ptr = e->base.buf;
113 if (!e->base.end)
114 e->base.end = e->base.buf;
115 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
116 {
117 Perl_warn(aTHX_ " ptr %p(%p)%p",
118 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
119 abort();
120 }
121 if (SvLEN(e->bufsv) < e->base.bufsiz)
122 {
123 SSize_t poff = e->base.ptr - e->base.buf;
124 SSize_t eoff = e->base.end - e->base.buf;
62e8870c 125 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz);
33af2bc7
NIS
126 e->base.ptr = e->base.buf + poff;
127 e->base.end = e->base.buf + eoff;
128 }
129 if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv))
130 {
131 Perl_warn(aTHX_ " ptr %p(%p)%p",
132 e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv));
133 abort();
134 }
135 return e->base.buf;
136}
137
33af2bc7
NIS
138IV
139PerlIOEncode_fill(PerlIO *f)
140{
141 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
142 dTHX;
143 dSP;
144 IV code;
33af2bc7
NIS
145 code = PerlIOBuf_fill(f);
146 if (code == 0)
147 {
148 SV *uni;
72e44f29
NIS
149 STRLEN len;
150 char *s;
151 /* Set SV that is the buffer to be buf..ptr */
33af2bc7
NIS
152 SvCUR_set(e->bufsv, e->base.end - e->base.buf);
153 SvUTF8_off(e->bufsv);
154 ENTER;
155 SAVETMPS;
156 PUSHMARK(sp);
157 XPUSHs(e->enc);
158 XPUSHs(e->bufsv);
159 XPUSHs(&PL_sv_yes);
160 PUTBACK;
161 if (perl_call_method("toUnicode",G_SCALAR) != 1)
162 code = -1;
163 SPAGAIN;
164 uni = POPs;
165 PUTBACK;
72e44f29
NIS
166 /* Now get translated string (forced to UTF-8) and copy back to buffer
167 don't use sv_setsv as that may "steal" PV from returned temp
168 and so free() our known-large-enough buffer.
169 sv_setpvn() should do but let us do it long hand.
170 */
171 s = SvPVutf8(uni,len);
172 if (s != SvPVX(e->bufsv))
173 {
62e8870c 174 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29
NIS
175 Move(s,e->base.buf,len,char);
176 SvCUR_set(e->bufsv,len);
177 }
178 SvUTF8_on(e->bufsv);
179 e->base.end = e->base.buf+len;
33af2bc7
NIS
180 e->base.ptr = e->base.buf;
181 FREETMPS;
182 LEAVE;
183 }
184 return code;
185}
186
187IV
188PerlIOEncode_flush(PerlIO *f)
189{
190 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
191 IV code = 0;
192 dTHX;
193 if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)))
194 {
195 dSP;
196 SV *str;
197 char *s;
198 STRLEN len;
72e44f29
NIS
199 SSize_t left = 0;
200 if (PerlIOBase(f)->flags & PERLIO_F_RDBUF)
201 {
202 /* This is really just a flag to see if we took all the data, if
203 we did PerlIOBase_flush avoids a seek to lower layer.
204 Need to revisit if we start getting clever with unreads or seeks-in-buffer
205 */
206 left = e->base.end - e->base.ptr;
207 }
33af2bc7
NIS
208 ENTER;
209 SAVETMPS;
210 PUSHMARK(sp);
211 XPUSHs(e->enc);
72e44f29 212 SvCUR_set(e->bufsv, e->base.ptr - e->base.buf);
33af2bc7
NIS
213 SvUTF8_on(e->bufsv);
214 XPUSHs(e->bufsv);
215 XPUSHs(&PL_sv_yes);
216 PUTBACK;
217 if (perl_call_method("fromUnicode",G_SCALAR) != 1)
218 code = -1;
219 SPAGAIN;
220 str = POPs;
221 PUTBACK;
72e44f29
NIS
222 s = SvPV(str,len);
223 if (s != SvPVX(e->bufsv))
224 {
62e8870c 225 e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len);
72e44f29
NIS
226 Move(s,e->base.buf,len,char);
227 SvCUR_set(e->bufsv,len);
228 }
33af2bc7 229 SvUTF8_off(e->bufsv);
72e44f29
NIS
230 e->base.ptr = e->base.buf+len;
231 /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */
232 e->base.end = e->base.ptr + left;
33af2bc7
NIS
233 FREETMPS;
234 LEAVE;
235 if (PerlIOBuf_flush(f) != 0)
236 code = -1;
237 }
238 return code;
239}
240
241IV
242PerlIOEncode_close(PerlIO *f)
243{
244 PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode);
245 IV code = PerlIOBase_close(f);
246 dTHX;
247 if (e->bufsv)
248 {
249 SvREFCNT_dec(e->bufsv);
250 e->bufsv = Nullsv;
251 }
252 e->base.buf = NULL;
253 e->base.ptr = NULL;
254 e->base.end = NULL;
255 PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF);
256 return code;
257}
258
72e44f29
NIS
259Off_t
260PerlIOEncode_tell(PerlIO *f)
261{
262 PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf);
263 /* Unfortunately the only way to get a postion is to back-translate,
264 the UTF8-bytes we have buf..ptr and adjust accordingly.
265 But we will try and save any unread data in case stream
266 is un-seekable.
267 */
268 if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end)
269 {
270 Size_t count = b->end - b->ptr;
271 PerlIO_push(f,&PerlIO_pending,"r",Nullch,0);
272 /* Save what we have left to read */
273 PerlIOSelf(f,PerlIOBuf)->bufsiz = count;
274 PerlIO_unread(f,b->ptr,count);
275 /* There isn't any unread data - we just saved it - so avoid the lower seek */
276 b->end = b->ptr;
277 /* Flush ourselves - now one layer down,
278 this does the back translate and adjusts position
279 */
280 PerlIO_flush(PerlIONext(f));
281 /* Set position of the saved data */
282 PerlIOSelf(f,PerlIOBuf)->posn = b->posn;
283 }
284 else
285 {
286 PerlIO_flush(f);
287 }
288 return b->posn;
289}
290
33af2bc7 291PerlIO_funcs PerlIO_encode = {
72e44f29 292 "encoding",
33af2bc7
NIS
293 sizeof(PerlIOEncode),
294 PERLIO_K_BUFFERED,
295 PerlIOBase_fileno,
296 PerlIOBuf_fdopen,
297 PerlIOBuf_open,
298 PerlIOBuf_reopen,
299 PerlIOEncode_pushed,
300 PerlIOEncode_popped,
301 PerlIOBuf_read,
302 PerlIOBuf_unread,
303 PerlIOBuf_write,
304 PerlIOBuf_seek,
72e44f29 305 PerlIOEncode_tell,
33af2bc7
NIS
306 PerlIOEncode_close,
307 PerlIOEncode_flush,
308 PerlIOEncode_fill,
309 PerlIOBase_eof,
310 PerlIOBase_error,
311 PerlIOBase_clearerr,
312 PerlIOBuf_setlinebuf,
313 PerlIOEncode_get_base,
314 PerlIOBuf_bufsiz,
315 PerlIOBuf_get_ptr,
316 PerlIOBuf_get_cnt,
317 PerlIOBuf_set_ptrcnt,
318};
319#endif
320
2f2b4ff2
NIS
321void
322Encode_Define(pTHX_ encode_t *enc)
323{
324 HV *hash = get_hv("Encode::encoding",GV_ADD|GV_ADDMULTI);
325 HV *stash = gv_stashpv("Encode::XS", TRUE);
326 SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash);
327 hv_store(hash,enc->name,strlen(enc->name),sv,0);
328}
329
183a2d84 330void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {}
67e989fb 331
2f2b4ff2
NIS
332static SV *
333encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check)
334{
335 STRLEN slen;
336 U8 *s = (U8 *) SvPV(src,slen);
337 SV *dst = sv_2mortal(newSV(2*slen+1));
338 if (slen)
339 {
340 U8 *d = (U8 *) SvGROW(dst, 2*slen+1);
341 STRLEN dlen = SvLEN(dst);
342 int code;
343 while ((code = do_encode(dir,s,&slen,d,dlen,&dlen)))
344 {
345 SvCUR_set(dst,dlen);
346 SvPOK_on(dst);
347 switch(code)
348 {
349 case ENCODE_NOSPACE:
350 {
351 STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN);
352 if (need <= SvLEN(dst))
353 need += UTF8_MAXLEN;
354 d = (U8 *) SvGROW(dst, need);
355 dlen = SvLEN(dst);
356 slen = SvCUR(src);
357 break;
358 }
359
360 case ENCODE_NOREP:
361 if (dir == enc->f_utf8)
362 {
363 if (!check && ckWARN_d(WARN_UTF8))
364 {
365 STRLEN clen;
366 UV ch = utf8_to_uv(s+slen,(SvCUR(src)-slen),&clen,0);
367 Perl_warner(aTHX_ WARN_UTF8, "\"\\x{%x}\" does not map to %s", ch, enc->name);
368 /* FIXME: Skip over the character, copy in replacement and continue
369 * but that is messy so for now just fail.
370 */
371 return &PL_sv_undef;
372 }
373 else
374 {
375 return &PL_sv_undef;
376 }
377 }
378 else
379 {
380 /* UTF-8 is supposed to be "Universal" so should not happen */
381 Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8",
382 enc->name, (SvCUR(src)-slen),s+slen);
383 }
384 break;
385
386 case ENCODE_PARTIAL:
387 if (!check && ckWARN_d(WARN_UTF8))
388 {
389 Perl_warner(aTHX_ WARN_UTF8, "Partial %s character",
390 (dir == enc->f_utf8) ? "UTF-8" : enc->name);
391 }
392 return &PL_sv_undef;
393
394 default:
395 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
396 code, (dir == enc->f_utf8) ? "to" : "from",enc->name);
397 return &PL_sv_undef;
398 }
399 }
400 SvCUR_set(dst,dlen);
401 SvPOK_on(dst);
402 if (check)
403 {
404 if (slen < SvCUR(src))
405 {
406 Move(s+slen,s,SvCUR(src)-slen,U8);
407 }
408 SvCUR_set(src,SvCUR(src)-slen);
409 }
410 }
411 return dst;
412}
413
414MODULE = Encode PACKAGE = Encode::XS PREFIX = Encode_
415
416PROTOTYPES: ENABLE
417
418void
419Encode_toUnicode(obj,src,check = 0)
420SV * obj
421SV * src
422int check
423CODE:
424 {
425 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
426 ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
427 SvUTF8_on(ST(0));
428 XSRETURN(1);
429 }
430
431void
432Encode_fromUnicode(obj,src,check = 0)
433SV * obj
434SV * src
435int check
436CODE:
437 {
438 encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
439 sv_utf8_upgrade(src);
440 ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
441 XSRETURN(1);
442 }
443
67e989fb 444MODULE = Encode PACKAGE = Encode
2c674647
JH
445
446PROTOTYPES: ENABLE
447
67e989fb 448I32
2c674647 449_bytes_to_utf8(sv, ...)
67e989fb 450 SV * sv
2c674647 451 CODE:
67e989fb
JH
452 {
453 SV * encoding = items == 2 ? ST(1) : Nullsv;
454
455 if (encoding)
456 RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
457 else {
458 STRLEN len;
183a2d84 459 U8* s = (U8*)SvPV(sv, len);
67e989fb
JH
460 U8* converted;
461
462 converted = bytes_to_utf8(s, &len); /* This allocs */
183a2d84 463 sv_setpvn(sv, (char *)converted, len);
67e989fb
JH
464 SvUTF8_on(sv); /* XXX Should we? */
465 Safefree(converted); /* ... so free it */
466 RETVAL = len;
467 }
468 }
2c674647 469 OUTPUT:
67e989fb 470 RETVAL
2c674647 471
67e989fb 472I32
2c674647 473_utf8_to_bytes(sv, ...)
67e989fb 474 SV * sv
2c674647 475 CODE:
67e989fb
JH
476 {
477 SV * to = items > 1 ? ST(1) : Nullsv;
478 SV * check = items > 2 ? ST(2) : Nullsv;
87714904 479
67e989fb
JH
480 if (to)
481 RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
482 else {
67e989fb 483 STRLEN len;
b113ac0e 484 U8 *s = (U8*)SvPV(sv, len);
67e989fb
JH
485
486 if (SvTRUE(check)) {
487 /* Must do things the slow way */
488 U8 *dest;
87714904 489 U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
67e989fb
JH
490 U8 *send = s + len;
491
492 New(83, dest, len, U8); /* I think */
493
494 while (s < send) {
495 if (*s < 0x80)
496 *dest++ = *s++;
497 else {
b113ac0e
JH
498 STRLEN ulen;
499 UV uv = *s++;
87714904 500
67e989fb
JH
501 /* Have to do it all ourselves because of error routine,
502 aargh. */
503 if (!(uv & 0x40))
504 goto failure;
505 if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; }
506 else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; }
507 else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; }
508 else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; }
509 else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; }
510 else if (!(uv & 0x01)) { ulen = 7; uv = 0; }
511 else { ulen = 13; uv = 0; }
87714904 512
67e989fb
JH
513 /* Note change to utf8.c variable naming, for variety */
514 while (ulen--) {
515 if ((*s & 0xc0) != 0x80)
516 goto failure;
87714904 517
67e989fb
JH
518 else
519 uv = (uv << 6) | (*s++ & 0x3f);
87714904 520 }
67e989fb
JH
521 if (uv > 256) {
522 failure:
523 call_failure(check, s, dest, src);
524 /* Now what happens? */
525 }
526 *dest++ = (U8)uv;
527 }
528 }
529 } else
530 RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
531 }
2c674647
JH
532 }
533 OUTPUT:
534 RETVAL
535
536SV *
537_chars_to_utf8(sv, from, ...)
538 SV * sv
539 SV * from
540 CODE:
541 {
542 SV * check = items == 3 ? ST(2) : Nullsv;
543 RETVAL = &PL_sv_undef;
544 }
545 OUTPUT:
546 RETVAL
547
548SV *
549_utf8_to_chars(sv, to, ...)
550 SV * sv
551 SV * to
552 CODE:
553 {
554 SV * check = items == 3 ? ST(2) : Nullsv;
555 RETVAL = &PL_sv_undef;
556 }
557 OUTPUT:
558 RETVAL
559
560SV *
561_utf8_to_chars_check(sv, ...)
562 SV * sv
563 CODE:
564 {
565 SV * check = items == 2 ? ST(1) : Nullsv;
566 RETVAL = &PL_sv_undef;
567 }
568 OUTPUT:
569 RETVAL
570
571SV *
572_bytes_to_chars(sv, from, ...)
573 SV * sv
574 SV * from
575 CODE:
576 {
577 SV * check = items == 3 ? ST(2) : Nullsv;
578 RETVAL = &PL_sv_undef;
579 }
580 OUTPUT:
581 RETVAL
582
583SV *
584_chars_to_bytes(sv, to, ...)
585 SV * sv
586 SV * to
587 CODE:
588 {
589 SV * check = items == 3 ? ST(2) : Nullsv;
590 RETVAL = &PL_sv_undef;
591 }
592 OUTPUT:
593 RETVAL
594
595SV *
596_from_to(sv, from, to, ...)
597 SV * sv
598 SV * from
599 SV * to
600 CODE:
601 {
602 SV * check = items == 4 ? ST(3) : Nullsv;
603 RETVAL = &PL_sv_undef;
604 }
605 OUTPUT:
606 RETVAL
607
608bool
609_is_utf8(sv, ...)
610 SV * sv
611 CODE:
612 {
613 SV * check = items == 2 ? ST(1) : Nullsv;
614 if (SvPOK(sv)) {
067a85ef 615 RETVAL = SvUTF8(sv) ? 1 : 0;
2c674647
JH
616 if (RETVAL &&
617 SvTRUE(check) &&
618 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
619 RETVAL = FALSE;
620 } else {
621 RETVAL = FALSE;
622 }
623 }
624 OUTPUT:
625 RETVAL
626
627SV *
628_on_utf8(sv)
629 SV * sv
630 CODE:
631 {
632 if (SvPOK(sv)) {
87714904 633 SV *rsv = newSViv(SvUTF8(sv));
2c674647
JH
634 RETVAL = rsv;
635 SvUTF8_on(sv);
636 } else {
637 RETVAL = &PL_sv_undef;
638 }
639 }
640 OUTPUT:
641 RETVAL
642
643SV *
644_off_utf8(sv)
645 SV * sv
646 CODE:
647 {
648 if (SvPOK(sv)) {
87714904 649 SV *rsv = newSViv(SvUTF8(sv));
2c674647
JH
650 RETVAL = rsv;
651 SvUTF8_off(sv);
652 } else {
653 RETVAL = &PL_sv_undef;
654 }
655 }
656 OUTPUT:
657 RETVAL
658
659SV *
660_utf_to_utf(sv, from, to, ...)
661 SV * sv
662 SV * from
663 SV * to
664 CODE:
665 {
666 SV * check = items == 4 ? ST(3) : Nullsv;
667 RETVAL = &PL_sv_undef;
668 }
669 OUTPUT:
670 RETVAL
671
33af2bc7
NIS
672BOOT:
673{
674#ifdef USE_PERLIO
675 PerlIO_define_layer(&PerlIO_encode);
676#endif
2f2b4ff2
NIS
677#include "iso8859.def"
678#include "EBCDIC.def"
679#include "Symbols.def"
33af2bc7 680}