This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Encode 1.42, from Dan Kogai.
[perl5.git] / ext / Encode / Encode.xs
1 #define PERL_NO_GET_CONTEXT
2 #include "EXTERN.h"
3 #include "perl.h"
4 #include "XSUB.h"
5 #define U8 U8
6 #include "encode.h"
7 #include "def_t.h"
8
9 #define FBCHAR                  0xFFFd
10 #define FBCHAR_UTF8             "\xEF\xBF\xBD"
11 #define BOM_BE                  0xFeFF
12 #define BOM16LE                 0xFFFe
13 #define BOM32LE                 0xFFFe0000
14 #define issurrogate(x)          (0xD800 <= (x)  && (x) <= 0xDFFF )
15 #define isHiSurrogate(x)        (0xD800 <= (x)  && (x) <  0xDC00 )
16 #define isLoSurrogate(x)        (0xDC00 <= (x)  && (x) <= 0xDFFF )
17 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
18
19 static UV
20 enc_unpack(pTHX_ U8 **sp,U8 *e,STRLEN size,U8 endian)
21 {
22     U8 *s = *sp;
23     UV v = 0;
24     if (s+size > e) {
25         croak("Partial character %c",(char) endian);
26     }
27     switch(endian) {
28         case 'N':
29             v = *s++;
30             v = (v << 8) | *s++;
31         case 'n':
32             v = (v << 8) | *s++;
33             v = (v << 8) | *s++;
34             break;
35         case 'V':
36         case 'v':
37             v |= *s++;
38             v |= (*s++ << 8);
39             if (endian == 'v')
40                 break;
41             v |= (*s++ << 16);
42             v |= (*s++ << 24);
43             break;
44         default:
45             croak("Unknown endian %c",(char) endian);
46             break;
47     }
48     *sp = s;
49     return v;
50 }
51
52 void
53 enc_pack(pTHX_ SV *result,STRLEN size,U8 endian,UV value)
54 {
55     U8 *d = (U8 *)SvGROW(result,SvCUR(result)+size);
56     switch(endian) {
57         case 'v':
58         case 'V':
59             d += SvCUR(result);
60             SvCUR_set(result,SvCUR(result)+size);
61             while (size--) {
62                 *d++ = value & 0xFF;
63                 value >>= 8;
64             }
65             break;
66         case 'n':
67         case 'N':
68             SvCUR_set(result,SvCUR(result)+size);
69             d += SvCUR(result);
70             while (size--) {
71                 *--d = value & 0xFF;
72                 value >>= 8;
73             }
74             break;
75         default:
76             croak("Unknown endian %c",(char) endian);
77             break;
78     }
79 }
80
81 #define ENCODE_XS_PROFILE 0 /* set 1 or more to profile.
82                                t/encoding.t dumps core because of
83                                Perl_warner and PerlIO don't work well */
84
85 #define ENCODE_XS_USEFP   1 /* set 0 to disable floating point to calculate
86                                buffer size for encode_method().
87                                1 is recommended. 2 restores NI-S original  */
88
89 #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX;   \
90                          Perl_croak(aTHX_ "panic_unimplemented"); \
91                          return (y)0; /* fool picky compilers */ \
92                          }
93 UNIMPLEMENTED(_encoded_utf8_to_bytes, I32)
94     UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
95
96 void
97 Encode_XSEncoding(pTHX_ encode_t * enc)
98 {
99     dSP;
100     HV *stash = gv_stashpv("Encode::XS", TRUE);
101     SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash);
102     int i = 0;
103     PUSHMARK(sp);
104     XPUSHs(sv);
105     while (enc->name[i]) {
106         const char *name = enc->name[i++];
107         XPUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
108     }
109     PUTBACK;
110     call_pv("Encode::define_encoding", G_DISCARD);
111     SvREFCNT_dec(sv);
112 }
113
114 void
115 call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig)
116 {
117  /* Exists for breakpointing */
118 }
119
120 static SV *
121 encode_method(pTHX_ encode_t * enc, encpage_t * dir, SV * src,
122                          int check)
123 {
124     STRLEN slen;
125     U8 *s = (U8 *) SvPV(src, slen);
126     STRLEN tlen  = slen;
127     STRLEN ddone = 0;
128     STRLEN sdone = 0;
129
130     /* We allocate slen+1.
131         PerlIO dumps core if this value is smaller than this. */
132     SV *dst = sv_2mortal(newSV(slen+1));
133     if (slen) {
134         U8 *d = (U8 *) SvPVX(dst);
135         STRLEN dlen = SvLEN(dst)-1;
136         int code;
137         while ((code = do_encode(dir, s, &slen, d, dlen, &dlen, !check))) {
138             SvCUR_set(dst, dlen+ddone);
139             SvPOK_only(dst);
140
141 #if ENCODE_XS_PROFILE >= 3
142             Perl_warn(aTHX_ "code=%d @ s=%d/%d/%d d=%d/%d/%d\n",code,slen,sdone,tlen,dlen,ddone,SvLEN(dst)-1);
143 #endif
144         
145             if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL)
146                 break;
147
148             switch (code) {
149             case ENCODE_NOSPACE:
150             {   
151                     STRLEN more = 0; /* make sure you initialize! */
152                     STRLEN sleft;
153                     sdone += slen;
154                     ddone += dlen;
155                     sleft = tlen - sdone;
156 #if ENCODE_XS_PROFILE >= 2
157                   Perl_warn(aTHX_
158                   "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
159                             more, sdone, sleft, SvLEN(dst));
160 #endif
161                     if (sdone != 0) { /* has src ever been processed ? */
162 #if   ENCODE_XS_USEFP == 2
163                             more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone
164                                     - SvLEN(dst);
165 #elif ENCODE_XS_USEFP
166                             more = (1.0*SvLEN(dst)+1)/sdone * sleft;
167 #else
168                             /* safe until SvLEN(dst) == MAX_INT/16 */
169                             more = (16*SvLEN(dst)+1)/sdone/16 * sleft;
170 #endif
171                     }
172                     more += UTF8_MAXLEN; /* insurance policy */
173 #if ENCODE_XS_PROFILE >= 2
174                   Perl_warn(aTHX_
175                   "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n",
176                             more, sdone, sleft, SvLEN(dst));
177 #endif
178                     d = (U8 *) SvGROW(dst, SvLEN(dst) + more);
179                     /* dst need to grow need MORE bytes! */
180                     if (ddone >= SvLEN(dst)) {
181                         Perl_croak(aTHX_ "Destination couldn't be grown.");
182                     }
183                     dlen = SvLEN(dst)-ddone-1;
184                     d   += ddone;
185                     s   += slen;
186                     slen = tlen-sdone;
187                     continue;
188             }
189
190             case ENCODE_NOREP:
191                 if (dir == enc->f_utf8) {
192                     STRLEN clen;
193                     UV ch =
194                         utf8n_to_uvuni(s + slen, (SvCUR(src) - slen),
195                                        &clen, 0);
196                     if (!check) { /* fallback char */
197                         sdone += slen + clen;
198                         ddone += dlen + enc->replen; 
199                         sv_catpvn(dst, enc->rep, enc->replen); 
200                     }
201                     else if (check == -1){ /* perlqq */
202                         SV* perlqq = 
203                             sv_2mortal(newSVpvf("\\x{%x}", ch));
204                        sdone += slen + clen;
205                        ddone += dlen + SvLEN(perlqq);
206                        sv_catsv(dst, perlqq);
207                     }                   
208                     else { 
209                           Perl_croak(aTHX_ 
210                                      "\"\\N{U+%" UVxf
211                                      "}\" does not map to %s", ch,
212                                         enc->name[0]);
213                     }
214             }
215             else {
216                 if (!check){  /* fallback char */
217                     sdone += slen + 1;
218                     ddone += dlen + strlen(FBCHAR_UTF8); 
219                     sv_catpv(dst, FBCHAR_UTF8); 
220                 }
221                 else if (check == -1){ /* perlqq */
222                     SV* perlqq = 
223                             sv_2mortal(newSVpvf("\\x%02X", s[slen]));
224                      sdone += slen + 1;
225                      ddone += dlen + SvLEN(perlqq);
226                      sv_catsv(dst, perlqq);
227                 }
228                 else {
229                     /* UTF-8 is supposed to be "Universal" so should not
230                 happen for real characters, but some encodings
231                     have non-assigned codes which may occur. */
232                         Perl_croak(aTHX_ "%s \"\\x%02X\" "
233                                            "does not map to Unicode (%d)",
234                                            enc->name[0], (U8) s[slen], code);
235                 }
236             }
237             dlen = SvCUR(dst); 
238             d   = SvPVX(dst) + dlen; 
239             s   = SvPVX(src) + sdone; 
240             slen = tlen - sdone;
241             break;
242
243             default:
244                 Perl_croak(aTHX_ "Unexpected code %d converting %s %s",
245                            code, (dir == enc->f_utf8) ? "to" : "from",
246                            enc->name[0]);
247                 return &PL_sv_undef;
248             }
249         }
250         SvCUR_set(dst, dlen+ddone);
251         SvPOK_only(dst);
252         if (check) {
253             sdone = SvCUR(src) - (slen+sdone);
254             if (sdone) {
255 #if 1
256                 /* FIXME: A Move() is dangerous - PV could be mmap'ed readonly
257                    SvOOK would be ideal - but sv_backoff does not understand SvLEN == 0
258                    type SVs and sv_clear() calls it ...
259                  */
260                  sv_setpvn(src, (char*)s+slen, sdone);
261 #else
262                 Move(s + slen, SvPVX(src), sdone , U8);
263 #endif
264             }
265             SvCUR_set(src, sdone);
266         }
267     }
268     else {
269         SvCUR_set(dst, 0);
270         SvPOK_only(dst);
271     }
272 #if ENCODE_XS_PROFILE
273     if (SvCUR(dst) > SvCUR(src)){
274             Perl_warn(aTHX_
275                       "SvLEN(dst)=%d, SvCUR(dst)=%d. "
276                       "%d bytes unused(%f %%)\n",
277                       SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst),
278                       (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0);
279         
280     }
281 #endif
282     *SvEND(dst) = '\0';
283     return dst;
284 }
285
286 MODULE = Encode         PACKAGE = Encode::XS    PREFIX = Method_
287
288 PROTOTYPES: ENABLE
289
290 void
291 Method_name(obj)
292 SV *    obj
293 CODE:
294  {
295   encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
296   ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0])));
297   XSRETURN(1);
298  }
299
300 void
301 Method_decode(obj,src,check = 0)
302 SV *    obj
303 SV *    src
304 int     check
305 CODE:
306  {
307   encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
308   ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check);
309   SvUTF8_on(ST(0));
310   XSRETURN(1);
311  }
312
313 void
314 Method_encode(obj,src,check = 0)
315 SV *    obj
316 SV *    src
317 int     check
318 CODE:
319  {
320   encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj)));
321   sv_utf8_upgrade(src);
322   ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check);
323   XSRETURN(1);
324  }
325
326 MODULE = Encode         PACKAGE = Encode::Unicode
327
328 void
329 decode_xs(obj, str, chk = &PL_sv_undef)
330 SV *    obj
331 SV *    str
332 SV *    chk
333 CODE:
334 {
335     int size    = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
336     U8 endian   = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
337     int ucs2    = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
338     SV *result = newSVpvn("",0);
339     STRLEN ulen;
340     U8 *s = (U8 *)SvPVbyte(str,ulen);
341     U8 *e = (U8 *)SvEND(str);
342     ST(0) = sv_2mortal(result);
343     SvUTF8_on(result);
344
345     if (!endian && s+size <= e) {
346         UV bom;
347         endian = (size == 4) ? 'N' : 'n';
348         bom = enc_unpack(aTHX_ &s,e,size,endian);
349         if (bom != BOM_BE) {
350             if (bom == BOM16LE) {
351                 endian = 'v';
352             }
353             else if (bom == BOM32LE) {
354                 endian = 'V';
355             }
356             else {
357                 croak("%s:Unregognised BOM %"UVxf,
358                       SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),bom);
359             }
360         }
361 #if 0
362         /* Update endian for this sequence */
363         hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
364 #endif
365     }
366     while (s < e && s+size <= e) {
367         UV ord = enc_unpack(aTHX_ &s,e,size,endian);
368         U8 *d;
369        if (size != 4 && invalid_ucs2(ord)) {
370             if (ucs2) {
371                 if (SvTRUE(chk)) {
372                     croak("%s:no surrogates allowed %"UVxf,
373                           SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
374                 }
375                 if (s+size <= e) {
376                      enc_unpack(aTHX_ &s,e,size,endian); /* skip the next one as well */
377                 }
378                 ord = FBCHAR;
379             }
380             else {
381                 UV lo;
382                 if (!isHiSurrogate(ord)) {
383                     croak("%s:Malformed HI surrogate %"UVxf,
384                           SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
385                 }
386                 if (s+size > e) {
387                     /* Partial character */
388                     s -= size;   /* back up to 1st half */
389                     break;       /* And exit loop */
390                 }
391                 lo = enc_unpack(aTHX_ &s,e,size,endian);
392                 if (!isLoSurrogate(lo)){
393                     croak("%s:Malformed LO surrogate %"UVxf,
394                           SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
395                 }
396                 ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
397             }
398         }
399         d = (U8 *) SvGROW(result,SvCUR(result)+UTF8_MAXLEN+1);
400         d = uvuni_to_utf8_flags(d+SvCUR(result), ord, 0);
401         SvCUR_set(result,d - (U8 *)SvPVX(result));
402     }
403     if (SvTRUE(chk)) {
404         if (s < e) {
405              Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
406                           SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
407              Move(s,SvPVX(str),e-s,U8);
408              SvCUR_set(str,(e-s));
409         }
410         else {
411             SvCUR_set(str,0);
412         }
413         *SvEND(str) = '\0';
414     }
415     XSRETURN(1);
416 }
417
418 void
419 encode_xs(obj, utf8, chk = &PL_sv_undef)
420 SV *    obj
421 SV *    utf8
422 SV *    chk
423 CODE:
424 {
425     int size   = SvIV(*hv_fetch((HV *)SvRV(obj),"size",4,0));
426     U8 endian = *((U8 *)SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"endian",6,0)));
427     int ucs2   = SvTRUE(*hv_fetch((HV *)SvRV(obj),"ucs2",4,0));
428     SV *result = newSVpvn("",0);
429     STRLEN ulen;
430     U8 *s = (U8 *)SvPVutf8(utf8,ulen);
431     U8 *e = (U8 *)SvEND(utf8);
432     ST(0) = sv_2mortal(result);
433     if (!endian) {
434         endian = (size == 4) ? 'N' : 'n';
435         enc_pack(aTHX_ result,size,endian,BOM_BE);
436 #if 0
437         /* Update endian for this sequence */
438         hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
439 #endif
440     }
441     while (s < e && s+UTF8SKIP(s) <= e) {
442         STRLEN len;
443         UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
444         s += len;
445        if (size != 4 && invalid_ucs2(ord)) {
446             if (!issurrogate(ord)){
447                 if (ucs2) {
448                     if (SvTRUE(chk)) {
449                         croak("%s:code point \"\\x{"UVxf"}\" too high",
450                           SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)),ord);
451                     }
452                     enc_pack(aTHX_ result,size,endian,FBCHAR);
453                 }else{
454                     UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
455                     UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
456                     enc_pack(aTHX_ result,size,endian,hi);
457                     enc_pack(aTHX_ result,size,endian,lo);
458                 }
459             }
460             else {
461                 /* not supposed to happen */
462                 enc_pack(aTHX_ result,size,endian,FBCHAR);
463             }
464         }
465         else {
466             enc_pack(aTHX_ result,size,endian,ord);
467         }
468     }
469     if (SvTRUE(chk)) {
470         if (s < e) {
471              Perl_warner(aTHX_ packWARN(WARN_UTF8),"%s:Partial character",
472                           SvPV_nolen(*hv_fetch((HV *)SvRV(obj),"Name",4,0)));
473              Move(s,SvPVX(utf8),e-s,U8);
474              SvCUR_set(utf8,(e-s));
475         }
476         else {
477             SvCUR_set(utf8,0);
478         }
479         *SvEND(utf8) = '\0';
480     }
481     XSRETURN(1);
482 }
483
484 MODULE = Encode         PACKAGE = Encode
485
486 PROTOTYPES: ENABLE
487
488 I32
489 _bytes_to_utf8(sv, ...)
490         SV *    sv
491       CODE:
492         {
493           SV * encoding = items == 2 ? ST(1) : Nullsv;
494
495           if (encoding)
496             RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding));
497           else {
498             STRLEN len;
499             U8*    s = (U8*)SvPV(sv, len);
500             U8*    converted;
501
502             converted = bytes_to_utf8(s, &len); /* This allocs */
503             sv_setpvn(sv, (char *)converted, len);
504             SvUTF8_on(sv); /* XXX Should we? */
505             Safefree(converted);                /* ... so free it */
506             RETVAL = len;
507           }
508         }
509       OUTPUT:
510         RETVAL
511
512 I32
513 _utf8_to_bytes(sv, ...)
514         SV *    sv
515       CODE:
516         {
517           SV * to    = items > 1 ? ST(1) : Nullsv;
518           SV * check = items > 2 ? ST(2) : Nullsv;
519
520           if (to)
521             RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to));
522           else {
523             STRLEN len;
524             U8 *s = (U8*)SvPV(sv, len);
525
526             RETVAL = 0;
527             if (SvTRUE(check)) {
528               /* Must do things the slow way */
529               U8 *dest;
530               U8 *src  = (U8*)savepv((char *)s); /* We need a copy to pass to check() */
531               U8 *send = s + len;
532
533               New(83, dest, len, U8); /* I think */
534
535               while (s < send) {
536                 if (*s < 0x80)
537                   *dest++ = *s++;
538                 else {
539                   STRLEN ulen;
540                   UV uv = *s++;
541
542                   /* Have to do it all ourselves because of error routine,
543                      aargh. */
544                   if (!(uv & 0x40))
545                     goto failure;
546                   if      (!(uv & 0x20)) { ulen = 2;  uv &= 0x1f; }
547                   else if (!(uv & 0x10)) { ulen = 3;  uv &= 0x0f; }
548                   else if (!(uv & 0x08)) { ulen = 4;  uv &= 0x07; }
549                   else if (!(uv & 0x04)) { ulen = 5;  uv &= 0x03; }
550                   else if (!(uv & 0x02)) { ulen = 6;  uv &= 0x01; }
551                   else if (!(uv & 0x01)) { ulen = 7;  uv = 0; }
552                   else                   { ulen = 13; uv = 0; }
553                 
554                   /* Note change to utf8.c variable naming, for variety */
555                   while (ulen--) {
556                     if ((*s & 0xc0) != 0x80)
557                       goto failure;
558                 
559                     else
560                       uv = (uv << 6) | (*s++ & 0x3f);
561                   }
562                   if (uv > 256) {
563                   failure:
564                     call_failure(check, s, dest, src);
565                     /* Now what happens? */
566                   }
567                   *dest++ = (U8)uv;
568                }
569                }
570             } else
571               RETVAL = (utf8_to_bytes(s, &len) ? len : 0);
572           }
573         }
574       OUTPUT:
575         RETVAL
576
577 bool
578 is_utf8(sv, check = 0)
579 SV *    sv
580 int     check
581       CODE:
582         {
583           if (SvGMAGICAL(sv)) /* it could be $1, for example */
584             sv = newSVsv(sv); /* GMAGIG will be done */
585           if (SvPOK(sv)) {
586             RETVAL = SvUTF8(sv) ? TRUE : FALSE;
587             if (RETVAL &&
588                 check  &&
589                 !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv)))
590               RETVAL = FALSE;
591           } else {
592             RETVAL = FALSE;
593           }
594           if (sv != ST(0))
595             SvREFCNT_dec(sv); /* it was a temp copy */
596         }
597       OUTPUT:
598         RETVAL
599
600 SV *
601 _utf8_on(sv)
602         SV *    sv
603       CODE:
604         {
605           if (SvPOK(sv)) {
606             SV *rsv = newSViv(SvUTF8(sv));
607             RETVAL = rsv;
608             SvUTF8_on(sv);
609           } else {
610             RETVAL = &PL_sv_undef;
611           }
612         }
613       OUTPUT:
614         RETVAL
615
616 SV *
617 _utf8_off(sv)
618         SV *    sv
619       CODE:
620         {
621           if (SvPOK(sv)) {
622             SV *rsv = newSViv(SvUTF8(sv));
623             RETVAL = rsv;
624             SvUTF8_off(sv);
625           } else {
626             RETVAL = &PL_sv_undef;
627           }
628         }
629       OUTPUT:
630         RETVAL
631
632 BOOT:
633 {
634 #if defined(USE_PERLIO) && !defined(USE_SFIO)
635 /* PerlIO_define_layer(aTHX_ &PerlIO_encode); */
636 #endif
637 #include "def_t.exh"
638 }