Commit | Line | Data |
---|---|---|
85982a32 | 1 | /* |
41c240f5 | 2 | $Id: Encode.xs,v 2.7 2006/01/15 15:43:36 dankogai Exp dankogai $ |
85982a32 JH |
3 | */ |
4 | ||
fcf2db38 | 5 | #define PERL_NO_GET_CONTEXT |
2c674647 JH |
6 | #include "EXTERN.h" |
7 | #include "perl.h" | |
8 | #include "XSUB.h" | |
2f2b4ff2 NIS |
9 | #define U8 U8 |
10 | #include "encode.h" | |
10c5ecbb JH |
11 | |
12 | # define PERLIO_MODNAME "PerlIO::encoding" | |
0ab8f81e | 13 | # define PERLIO_FILENAME "PerlIO/encoding.pm" |
aae85ceb | 14 | |
85982a32 JH |
15 | /* set 1 or more to profile. t/encoding.t dumps core because of |
16 | Perl_warner and PerlIO don't work well */ | |
c6a7db43 | 17 | #define ENCODE_XS_PROFILE 0 |
39cf9a5e | 18 | |
85982a32 JH |
19 | /* set 0 to disable floating point to calculate buffer size for |
20 | encode_method(). 1 is recommended. 2 restores NI-S original */ | |
c6a7db43 | 21 | #define ENCODE_XS_USEFP 1 |
39cf9a5e | 22 | |
fcf2db38 | 23 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ |
2f5768b8 | 24 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a | 25 | return (y)0; /* fool picky compilers */ \ |
87714904 | 26 | } |
85982a32 | 27 | /**/ |
011b2d2f | 28 | |
7e9a885a JH |
29 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
30 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) | |
33af2bc7 | 31 | |
7f0d54d7 RGS |
32 | #define UTF8_ALLOW_STRICT 0 |
33 | #define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ | |
34 | ~(UTF8_ALLOW_CONTINUATION | \ | |
35 | UTF8_ALLOW_NON_CONTINUATION | \ | |
36 | UTF8_ALLOW_LONG)) | |
37 | ||
8e180e82 SP |
38 | static SV* fallback_cb = (SV*)NULL ; |
39 | ||
b0b300a3 | 40 | void |
aa0053b7 | 41 | Encode_XSEncoding(pTHX_ encode_t * enc) |
2f2b4ff2 | 42 | { |
aa0053b7 NIS |
43 | dSP; |
44 | HV *stash = gv_stashpv("Encode::XS", TRUE); | |
45 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))), stash); | |
46 | int i = 0; | |
47 | PUSHMARK(sp); | |
48 | XPUSHs(sv); | |
49 | while (enc->name[i]) { | |
50 | const char *name = enc->name[i++]; | |
51 | XPUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); | |
52 | } | |
53 | PUTBACK; | |
54 | call_pv("Encode::define_encoding", G_DISCARD); | |
55 | SvREFCNT_dec(sv); | |
2f2b4ff2 NIS |
56 | } |
57 | ||
aa0053b7 NIS |
58 | void |
59 | call_failure(SV * routine, U8 * done, U8 * dest, U8 * orig) | |
60 | { | |
85982a32 | 61 | /* Exists for breakpointing */ |
aa0053b7 | 62 | } |
67e989fb | 63 | |
85982a32 | 64 | |
2fc614e0 JH |
65 | #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" |
66 | #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" | |
67 | ||
2f2b4ff2 | 68 | static SV * |
8e180e82 SP |
69 | do_fallback_cb(pTHX_ UV ch) |
70 | { | |
71 | dSP; | |
72 | int argc; | |
73 | SV* retval; | |
74 | ENTER; | |
75 | SAVETMPS; | |
76 | PUSHMARK(sp); | |
77 | XPUSHs(sv_2mortal(newSVnv((UV)ch))); | |
78 | PUTBACK; | |
79 | argc = call_sv(fallback_cb, G_SCALAR); | |
80 | SPAGAIN; | |
81 | if (argc != 1){ | |
82 | croak("fallback sub must return scalar!"); | |
83 | } | |
84 | retval = newSVsv(POPs); | |
85 | PUTBACK; | |
86 | FREETMPS; | |
87 | LEAVE; | |
88 | return retval; | |
89 | } | |
90 | ||
91 | static SV * | |
0629a5b3 | 92 | encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, |
220e2d4e | 93 | int check, STRLEN * offset, SV * term, int * retcode) |
2f2b4ff2 | 94 | { |
aa0053b7 NIS |
95 | STRLEN slen; |
96 | U8 *s = (U8 *) SvPV(src, slen); | |
3aececda NIS |
97 | STRLEN tlen = slen; |
98 | STRLEN ddone = 0; | |
99 | STRLEN sdone = 0; | |
39cf9a5e | 100 | |
3c49ab08 | 101 | /* We allocate slen+1. |
85982a32 | 102 | PerlIO dumps core if this value is smaller than this. */ |
3c49ab08 | 103 | SV *dst = sv_2mortal(newSV(slen+1)); |
85982a32 JH |
104 | U8 *d = (U8 *)SvPVX(dst); |
105 | STRLEN dlen = SvLEN(dst)-1; | |
220e2d4e IH |
106 | int code = 0; |
107 | STRLEN trmlen = 0; | |
cc7dbc11 | 108 | U8 *trm = term ? (U8*) SvPV(term, trmlen) : NULL; |
220e2d4e IH |
109 | |
110 | if (offset) { | |
111 | s += *offset; | |
6be7c101 JH |
112 | if (slen > *offset){ /* safeguard against slen overflow */ |
113 | slen -= *offset; | |
114 | }else{ | |
115 | slen = 0; | |
116 | } | |
220e2d4e IH |
117 | tlen = slen; |
118 | } | |
85982a32 | 119 | |
6be7c101 | 120 | if (slen == 0){ |
85982a32 JH |
121 | SvCUR_set(dst, 0); |
122 | SvPOK_only(dst); | |
123 | goto ENCODE_END; | |
124 | } | |
125 | ||
220e2d4e IH |
126 | while( (code = do_encode(dir, s, &slen, d, dlen, &dlen, !check, |
127 | trm, trmlen)) ) | |
85982a32 JH |
128 | { |
129 | SvCUR_set(dst, dlen+ddone); | |
130 | SvPOK_only(dst); | |
0b3236bb | 131 | |
220e2d4e IH |
132 | if (code == ENCODE_FALLBACK || code == ENCODE_PARTIAL || |
133 | code == ENCODE_FOUND_TERM) { | |
85982a32 JH |
134 | break; |
135 | } | |
136 | switch (code) { | |
137 | case ENCODE_NOSPACE: | |
138 | { | |
139 | STRLEN more = 0; /* make sure you initialize! */ | |
140 | STRLEN sleft; | |
141 | sdone += slen; | |
142 | ddone += dlen; | |
143 | sleft = tlen - sdone; | |
fcb875d4 | 144 | #if ENCODE_XS_PROFILE >= 2 |
85982a32 JH |
145 | Perl_warn(aTHX_ |
146 | "more=%d, sdone=%d, sleft=%d, SvLEN(dst)=%d\n", | |
147 | more, sdone, sleft, SvLEN(dst)); | |
fcb875d4 | 148 | #endif |
85982a32 | 149 | if (sdone != 0) { /* has src ever been processed ? */ |
39cf9a5e | 150 | #if ENCODE_XS_USEFP == 2 |
85982a32 JH |
151 | more = (1.0*tlen*SvLEN(dst)+sdone-1)/sdone |
152 | - SvLEN(dst); | |
39cf9a5e | 153 | #elif ENCODE_XS_USEFP |
6e21dc91 | 154 | more = (STRLEN)((1.0*SvLEN(dst)+1)/sdone * sleft); |
39cf9a5e | 155 | #else |
85982a32 JH |
156 | /* safe until SvLEN(dst) == MAX_INT/16 */ |
157 | more = (16*SvLEN(dst)+1)/sdone/16 * sleft; | |
39cf9a5e | 158 | #endif |
39cf9a5e | 159 | } |
85982a32 JH |
160 | more += UTF8_MAXLEN; /* insurance policy */ |
161 | d = (U8 *) SvGROW(dst, SvLEN(dst) + more); | |
162 | /* dst need to grow need MORE bytes! */ | |
163 | if (ddone >= SvLEN(dst)) { | |
164 | Perl_croak(aTHX_ "Destination couldn't be grown."); | |
165 | } | |
166 | dlen = SvLEN(dst)-ddone-1; | |
167 | d += ddone; | |
168 | s += slen; | |
169 | slen = tlen-sdone; | |
170 | continue; | |
171 | } | |
172 | case ENCODE_NOREP: | |
173 | /* encoding */ | |
c6a7db43 | 174 | if (dir == enc->f_utf8) { |
85982a32 JH |
175 | STRLEN clen; |
176 | UV ch = | |
3e952a88 | 177 | utf8n_to_uvuni(s+slen, (SvCUR(src)-slen), |
b0b300a3 | 178 | &clen, UTF8_ALLOW_ANY|UTF8_CHECK_ONLY); |
56ff7374 RGS |
179 | /* if non-representable multibyte prefix at end of current buffer - break*/ |
180 | if (clen > tlen - sdone) break; | |
85982a32 | 181 | if (check & ENCODE_DIE_ON_ERR) { |
2fc614e0 JH |
182 | Perl_croak(aTHX_ ERR_ENCODE_NOMAP, |
183 | (UV)ch, enc->name[0]); | |
4089adc4 JH |
184 | return &PL_sv_undef; /* never reaches but be safe */ |
185 | } | |
186 | if (check & ENCODE_WARN_ON_ERR){ | |
187 | Perl_warner(aTHX_ packWARN(WARN_UTF8), | |
2fc614e0 | 188 | ERR_ENCODE_NOMAP, (UV)ch, enc->name[0]); |
4089adc4 JH |
189 | } |
190 | if (check & ENCODE_RETURN_ON_ERR){ | |
191 | goto ENCODE_SET_SRC; | |
192 | } | |
f9d05ba3 RGS |
193 | if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ |
194 | SV* subchar = | |
8e180e82 | 195 | (fallback_cb != (SV*)NULL) ? do_fallback_cb(aTHX_ ch) : |
f9d05ba3 RGS |
196 | newSVpvf(check & ENCODE_PERLQQ ? "\\x{%04"UVxf"}" : |
197 | check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : | |
198 | "&#x%" UVxf ";", (UV)ch); | |
4089adc4 | 199 | sdone += slen + clen; |
f9d05ba3 RGS |
200 | ddone += dlen + SvCUR(subchar); |
201 | sv_catsv(dst, subchar); | |
202 | SvREFCNT_dec(subchar); | |
4089adc4 JH |
203 | } else { |
204 | /* fallback char */ | |
205 | sdone += slen + clen; | |
206 | ddone += dlen + enc->replen; | |
207 | sv_catpvn(dst, (char*)enc->rep, enc->replen); | |
c6a7db43 | 208 | } |
b2704119 | 209 | } |
85982a32 | 210 | /* decoding */ |
c6a7db43 | 211 | else { |
85982a32 | 212 | if (check & ENCODE_DIE_ON_ERR){ |
2fc614e0 | 213 | Perl_croak(aTHX_ ERR_DECODE_NOMAP, |
436c6dd3 | 214 | enc->name[0], (UV)s[slen]); |
4089adc4 JH |
215 | return &PL_sv_undef; /* never reaches but be safe */ |
216 | } | |
217 | if (check & ENCODE_WARN_ON_ERR){ | |
218 | Perl_warner( | |
219 | aTHX_ packWARN(WARN_UTF8), | |
2fc614e0 | 220 | ERR_DECODE_NOMAP, |
436c6dd3 | 221 | enc->name[0], (UV)s[slen]); |
4089adc4 JH |
222 | } |
223 | if (check & ENCODE_RETURN_ON_ERR){ | |
224 | goto ENCODE_SET_SRC; | |
225 | } | |
226 | if (check & | |
227 | (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ | |
8e180e82 SP |
228 | SV* subchar = |
229 | (fallback_cb != (SV*)NULL) ? | |
230 | do_fallback_cb(aTHX_ (UV)s[slen]) : | |
231 | newSVpvf("\\x%02" UVXf, (UV)s[slen]); | |
4089adc4 | 232 | sdone += slen + 1; |
f9d05ba3 RGS |
233 | ddone += dlen + SvCUR(subchar); |
234 | sv_catsv(dst, subchar); | |
235 | SvREFCNT_dec(subchar); | |
4089adc4 JH |
236 | } else { |
237 | sdone += slen + 1; | |
238 | ddone += dlen + strlen(FBCHAR_UTF8); | |
239 | sv_catpv(dst, FBCHAR_UTF8); | |
aa0053b7 | 240 | } |
b2704119 | 241 | } |
85982a32 | 242 | /* settle variables when fallback */ |
b0b300a3 JH |
243 | d = (U8 *)SvEND(dst); |
244 | dlen = SvLEN(dst) - ddone - 1; | |
3e952a88 | 245 | s = (U8*)SvPVX(src) + sdone; |
b2704119 JH |
246 | slen = tlen - sdone; |
247 | break; | |
2f2b4ff2 | 248 | |
85982a32 JH |
249 | default: |
250 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", | |
251 | code, (dir == enc->f_utf8) ? "to" : "from", | |
252 | enc->name[0]); | |
253 | return &PL_sv_undef; | |
aa0053b7 | 254 | } |
85982a32 JH |
255 | } |
256 | ENCODE_SET_SRC: | |
ca777f1c NIS |
257 | if (check && !(check & ENCODE_LEAVE_SRC)){ |
258 | sdone = SvCUR(src) - (slen+sdone); | |
85982a32 JH |
259 | if (sdone) { |
260 | sv_setpvn(src, (char*)s+slen, sdone); | |
aa0053b7 | 261 | } |
85982a32 | 262 | SvCUR_set(src, sdone); |
2f2b4ff2 | 263 | } |
85982a32 | 264 | /* warn("check = 0x%X, code = 0x%d\n", check, code); */ |
c6a7db43 | 265 | |
85982a32 JH |
266 | SvCUR_set(dst, dlen+ddone); |
267 | SvPOK_only(dst); | |
c6a7db43 | 268 | |
39cf9a5e DK |
269 | #if ENCODE_XS_PROFILE |
270 | if (SvCUR(dst) > SvCUR(src)){ | |
85982a32 JH |
271 | Perl_warn(aTHX_ |
272 | "SvLEN(dst)=%d, SvCUR(dst)=%d. %d bytes unused(%f %%)\n", | |
273 | SvLEN(dst), SvCUR(dst), SvLEN(dst) - SvCUR(dst), | |
274 | (SvLEN(dst) - SvCUR(dst))*1.0/SvLEN(dst)*100.0); | |
39cf9a5e | 275 | } |
3c49ab08 | 276 | #endif |
c6a7db43 | 277 | |
220e2d4e IH |
278 | if (offset) |
279 | *offset += sdone + slen; | |
280 | ||
85982a32 | 281 | ENCODE_END: |
0b3236bb | 282 | *SvEND(dst) = '\0'; |
220e2d4e | 283 | if (retcode) *retcode = code; |
aa0053b7 | 284 | return dst; |
2f2b4ff2 NIS |
285 | } |
286 | ||
7f0d54d7 RGS |
287 | static bool |
288 | strict_utf8(pTHX_ SV* sv) | |
289 | { | |
290 | HV* hv; | |
291 | SV** svp; | |
292 | sv = SvRV(sv); | |
293 | if (!sv || SvTYPE(sv) != SVt_PVHV) | |
294 | return 0; | |
295 | hv = (HV*)sv; | |
296 | svp = hv_fetch(hv, "strict_utf8", 11, 0); | |
297 | if (!svp) | |
298 | return 0; | |
299 | return SvTRUE(*svp); | |
300 | } | |
301 | ||
302 | static U8* | |
303 | process_utf8(pTHX_ SV* dst, U8* s, U8* e, int check, | |
304 | bool encode, bool strict, bool stop_at_partial) | |
305 | { | |
306 | UV uv; | |
307 | STRLEN ulen; | |
308 | ||
309 | SvPOK_only(dst); | |
310 | SvCUR_set(dst,0); | |
311 | ||
312 | while (s < e) { | |
313 | if (UTF8_IS_INVARIANT(*s)) { | |
314 | sv_catpvn(dst, (char *)s, 1); | |
315 | s++; | |
316 | continue; | |
317 | } | |
318 | ||
319 | if (UTF8_IS_START(*s)) { | |
320 | U8 skip = UTF8SKIP(s); | |
321 | if ((s + skip) > e) { | |
322 | /* Partial character */ | |
323 | /* XXX could check that rest of bytes are UTF8_IS_CONTINUATION(ch) */ | |
56ff7374 | 324 | if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) |
7f0d54d7 RGS |
325 | break; |
326 | ||
327 | goto malformed_byte; | |
328 | } | |
329 | ||
330 | uv = utf8n_to_uvuni(s, e - s, &ulen, | |
331 | UTF8_CHECK_ONLY | (strict ? UTF8_ALLOW_STRICT : | |
332 | UTF8_ALLOW_NONSTRICT) | |
333 | ); | |
334 | #if 1 /* perl-5.8.6 and older do not check UTF8_ALLOW_LONG */ | |
335 | if (strict && uv > PERL_UNICODE_MAX) | |
336 | ulen = -1; | |
337 | #endif | |
338 | if (ulen == -1) { | |
339 | if (strict) { | |
340 | uv = utf8n_to_uvuni(s, e - s, &ulen, | |
341 | UTF8_CHECK_ONLY | UTF8_ALLOW_NONSTRICT); | |
342 | if (ulen == -1) | |
343 | goto malformed_byte; | |
344 | goto malformed; | |
345 | } | |
346 | goto malformed_byte; | |
347 | } | |
348 | ||
349 | ||
350 | /* Whole char is good */ | |
351 | sv_catpvn(dst,(char *)s,skip); | |
352 | s += skip; | |
353 | continue; | |
354 | } | |
355 | ||
356 | /* If we get here there is something wrong with alleged UTF-8 */ | |
357 | malformed_byte: | |
358 | uv = (UV)*s; | |
359 | ulen = 1; | |
360 | ||
361 | malformed: | |
362 | if (check & ENCODE_DIE_ON_ERR){ | |
363 | if (encode) | |
364 | Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8"); | |
365 | else | |
366 | Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv); | |
367 | } | |
368 | if (check & ENCODE_WARN_ON_ERR){ | |
369 | if (encode) | |
370 | Perl_warner(aTHX_ packWARN(WARN_UTF8), | |
371 | ERR_ENCODE_NOMAP, uv, "utf8"); | |
372 | else | |
373 | Perl_warner(aTHX_ packWARN(WARN_UTF8), | |
374 | ERR_DECODE_NOMAP, "utf8", uv); | |
375 | } | |
376 | if (check & ENCODE_RETURN_ON_ERR) { | |
377 | break; | |
378 | } | |
379 | if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){ | |
380 | SV* subchar = newSVpvf(check & ENCODE_PERLQQ ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}"): | |
381 | check & ENCODE_HTMLCREF ? "&#%" UVuf ";" : | |
382 | "&#x%" UVxf ";", uv); | |
383 | sv_catsv(dst, subchar); | |
384 | SvREFCNT_dec(subchar); | |
385 | } else { | |
386 | sv_catpv(dst, FBCHAR_UTF8); | |
387 | } | |
388 | s += ulen; | |
389 | } | |
390 | *SvEND(dst) = '\0'; | |
391 | ||
392 | return s; | |
393 | } | |
394 | ||
395 | ||
ab3374e4 DK |
396 | MODULE = Encode PACKAGE = Encode::utf8 PREFIX = Method_ |
397 | ||
a0d8a30e DK |
398 | PROTOTYPES: DISABLE |
399 | ||
400 | void | |
b536bf57 | 401 | Method_decode_xs(obj,src,check = 0) |
ab3374e4 DK |
402 | SV * obj |
403 | SV * src | |
404 | int check | |
405 | CODE: | |
406 | { | |
407 | STRLEN slen; | |
408 | U8 *s = (U8 *) SvPV(src, slen); | |
409 | U8 *e = (U8 *) SvEND(src); | |
b536bf57 | 410 | SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ |
cc836e95 RGS |
411 | |
412 | /* | |
7f0d54d7 | 413 | * PerlIO check -- we assume the object is of PerlIO if renewed |
cc836e95 | 414 | */ |
f5a22bee | 415 | bool renewed = 0; |
cc836e95 RGS |
416 | dSP; ENTER; SAVETMPS; |
417 | PUSHMARK(sp); | |
418 | XPUSHs(obj); | |
419 | PUTBACK; | |
420 | if (call_method("renewed",G_SCALAR) == 1) { | |
421 | SPAGAIN; | |
f5a22bee | 422 | renewed = (bool)POPi; |
cc836e95 RGS |
423 | PUTBACK; |
424 | #if 0 | |
425 | fprintf(stderr, "renewed == %d\n", renewed); | |
426 | #endif | |
cc836e95 RGS |
427 | } |
428 | FREETMPS; LEAVE; | |
429 | /* end PerlIO check */ | |
430 | ||
ab3374e4 DK |
431 | if (SvUTF8(src)) { |
432 | s = utf8_to_bytes(s,&slen); | |
433 | if (s) { | |
434 | SvCUR_set(src,slen); | |
435 | SvUTF8_off(src); | |
436 | e = s+slen; | |
437 | } | |
438 | else { | |
439 | croak("Cannot decode string with wide characters"); | |
440 | } | |
441 | } | |
7f0d54d7 RGS |
442 | |
443 | s = process_utf8(aTHX_ dst, s, e, check, 0, strict_utf8(aTHX_ obj), renewed); | |
ab3374e4 DK |
444 | |
445 | /* Clear out translated part of source unless asked not to */ | |
446 | if (check && !(check & ENCODE_LEAVE_SRC)){ | |
447 | slen = e-s; | |
448 | if (slen) { | |
449 | sv_setpvn(src, (char*)s, slen); | |
450 | } | |
451 | SvCUR_set(src, slen); | |
452 | } | |
453 | SvUTF8_on(dst); | |
454 | ST(0) = sv_2mortal(dst); | |
455 | XSRETURN(1); | |
456 | } | |
457 | ||
458 | void | |
b536bf57 | 459 | Method_encode_xs(obj,src,check = 0) |
ab3374e4 DK |
460 | SV * obj |
461 | SV * src | |
462 | int check | |
463 | CODE: | |
464 | { | |
465 | STRLEN slen; | |
466 | U8 *s = (U8 *) SvPV(src, slen); | |
467 | U8 *e = (U8 *) SvEND(src); | |
b536bf57 | 468 | SV *dst = newSV(slen>0?slen:1); /* newSV() abhors 0 -- inaba */ |
ab3374e4 | 469 | if (SvUTF8(src)) { |
7f0d54d7 RGS |
470 | /* Already encoded */ |
471 | if (strict_utf8(aTHX_ obj)) { | |
472 | s = process_utf8(aTHX_ dst, s, e, check, 1, 1, 0); | |
473 | } | |
474 | else { | |
475 | /* trust it and just copy the octets */ | |
476 | sv_setpvn(dst,(char *)s,(e-s)); | |
477 | s = e; | |
478 | } | |
ab3374e4 DK |
479 | } |
480 | else { | |
481 | /* Native bytes - can always encode */ | |
b536bf57 | 482 | U8 *d = (U8 *) SvGROW(dst, 2*slen+1); /* +1 or assertion will botch */ |
ab3374e4 DK |
483 | while (s < e) { |
484 | UV uv = NATIVE_TO_UNI((UV) *s++); | |
485 | if (UNI_IS_INVARIANT(uv)) | |
486 | *d++ = (U8)UTF_TO_NATIVE(uv); | |
487 | else { | |
488 | *d++ = (U8)UTF8_EIGHT_BIT_HI(uv); | |
489 | *d++ = (U8)UTF8_EIGHT_BIT_LO(uv); | |
490 | } | |
491 | } | |
492 | SvCUR_set(dst, d- (U8 *)SvPVX(dst)); | |
493 | *SvEND(dst) = '\0'; | |
494 | } | |
495 | ||
496 | /* Clear out translated part of source unless asked not to */ | |
497 | if (check && !(check & ENCODE_LEAVE_SRC)){ | |
498 | slen = e-s; | |
499 | if (slen) { | |
500 | sv_setpvn(src, (char*)s, slen); | |
501 | } | |
502 | SvCUR_set(src, slen); | |
503 | } | |
504 | SvPOK_only(dst); | |
505 | SvUTF8_off(dst); | |
506 | ST(0) = sv_2mortal(dst); | |
507 | XSRETURN(1); | |
508 | } | |
509 | ||
50d26985 | 510 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 NIS |
511 | |
512 | PROTOTYPES: ENABLE | |
513 | ||
514 | void | |
a0d8a30e DK |
515 | Method_renew(obj) |
516 | SV * obj | |
517 | CODE: | |
518 | { | |
519 | XSRETURN(1); | |
520 | } | |
521 | ||
cc836e95 RGS |
522 | int |
523 | Method_renewed(obj) | |
524 | SV * obj | |
525 | CODE: | |
526 | RETVAL = 0; | |
527 | OUTPUT: | |
528 | RETVAL | |
529 | ||
a0d8a30e | 530 | void |
0a95303c NIS |
531 | Method_name(obj) |
532 | SV * obj | |
533 | CODE: | |
85982a32 JH |
534 | { |
535 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); | |
536 | ST(0) = sv_2mortal(newSVpvn(enc->name[0],strlen(enc->name[0]))); | |
537 | XSRETURN(1); | |
538 | } | |
0a95303c NIS |
539 | |
540 | void | |
220e2d4e IH |
541 | Method_cat_decode(obj, dst, src, off, term, check = 0) |
542 | SV * obj | |
543 | SV * dst | |
544 | SV * src | |
545 | SV * off | |
546 | SV * term | |
547 | int check | |
548 | CODE: | |
549 | { | |
550 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); | |
551 | STRLEN offset = (STRLEN)SvIV(off); | |
552 | int code = 0; | |
553 | if (SvUTF8(src)) { | |
554 | sv_utf8_downgrade(src, FALSE); | |
555 | } | |
556 | sv_catsv(dst, encode_method(aTHX_ enc, enc->t_utf8, src, check, | |
557 | &offset, term, &code)); | |
b162af07 | 558 | SvIV_set(off, (IV)offset); |
220e2d4e IH |
559 | if (code == ENCODE_FOUND_TERM) { |
560 | ST(0) = &PL_sv_yes; | |
561 | }else{ | |
562 | ST(0) = &PL_sv_no; | |
563 | } | |
564 | XSRETURN(1); | |
565 | } | |
566 | ||
567 | void | |
8e180e82 | 568 | Method_decode(obj,src,check_sv = &PL_sv_no) |
2f2b4ff2 NIS |
569 | SV * obj |
570 | SV * src | |
8e180e82 | 571 | SV * check_sv |
2f2b4ff2 | 572 | CODE: |
aae85ceb | 573 | { |
8e180e82 | 574 | int check; |
85982a32 | 575 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
ab3374e4 DK |
576 | if (SvUTF8(src)) { |
577 | sv_utf8_downgrade(src, FALSE); | |
578 | } | |
8e180e82 SP |
579 | if (SvROK(check_sv)){ |
580 | if (fallback_cb == (SV*)NULL){ | |
581 | fallback_cb = newSVsv(check_sv); /* First time */ | |
582 | }else{ | |
583 | SvSetSV(fallback_cb, check_sv); /* Been here before */ | |
584 | } | |
585 | check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ | |
586 | }else{ | |
587 | fallback_cb = (SV*)NULL; | |
588 | check = SvIV(check_sv); | |
589 | } | |
220e2d4e IH |
590 | ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check, |
591 | NULL, Nullsv, NULL); | |
85982a32 | 592 | SvUTF8_on(ST(0)); |
aae85ceb DK |
593 | XSRETURN(1); |
594 | } | |
595 | ||
8e180e82 SP |
596 | |
597 | ||
aae85ceb | 598 | void |
8e180e82 | 599 | Method_encode(obj,src,check_sv = &PL_sv_no) |
aae85ceb | 600 | SV * obj |
85982a32 | 601 | SV * src |
8e180e82 | 602 | SV * check_sv |
aae85ceb DK |
603 | CODE: |
604 | { | |
8e180e82 | 605 | int check; |
85982a32 JH |
606 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); |
607 | sv_utf8_upgrade(src); | |
8e180e82 SP |
608 | if (SvROK(check_sv)){ |
609 | if (fallback_cb == (SV*)NULL){ | |
610 | fallback_cb = newSVsv(check_sv); /* First time */ | |
611 | }else{ | |
612 | SvSetSV(fallback_cb, check_sv); /* Been here before */ | |
613 | } | |
614 | check = ENCODE_PERLQQ|ENCODE_LEAVE_SRC; /* same as FB_PERLQQ */ | |
615 | }else{ | |
616 | fallback_cb = (SV*)NULL; | |
617 | check = SvIV(check_sv); | |
618 | } | |
220e2d4e IH |
619 | ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check, |
620 | NULL, Nullsv, NULL); | |
aae85ceb DK |
621 | XSRETURN(1); |
622 | } | |
623 | ||
0ab8f81e JH |
624 | void |
625 | Method_needs_lines(obj) | |
626 | SV * obj | |
627 | CODE: | |
628 | { | |
b32afa7c | 629 | /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ |
0ab8f81e JH |
630 | ST(0) = &PL_sv_no; |
631 | XSRETURN(1); | |
632 | } | |
633 | ||
634 | void | |
635 | Method_perlio_ok(obj) | |
636 | SV * obj | |
637 | CODE: | |
638 | { | |
b32afa7c | 639 | /* encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); */ |
10c5ecbb JH |
640 | /* require_pv(PERLIO_FILENAME); */ |
641 | ||
642 | eval_pv("require PerlIO::encoding", 0); | |
643 | ||
644 | if (SvTRUE(get_sv("@", 0))) { | |
0ab8f81e | 645 | ST(0) = &PL_sv_no; |
10c5ecbb JH |
646 | }else{ |
647 | ST(0) = &PL_sv_yes; | |
0ab8f81e JH |
648 | } |
649 | XSRETURN(1); | |
650 | } | |
651 | ||
67e989fb | 652 | MODULE = Encode PACKAGE = Encode |
2c674647 JH |
653 | |
654 | PROTOTYPES: ENABLE | |
655 | ||
67e989fb | 656 | I32 |
2c674647 | 657 | _bytes_to_utf8(sv, ...) |
85982a32 JH |
658 | SV * sv |
659 | CODE: | |
660 | { | |
661 | SV * encoding = items == 2 ? ST(1) : Nullsv; | |
c6a7db43 | 662 | |
85982a32 JH |
663 | if (encoding) |
664 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); | |
665 | else { | |
666 | STRLEN len; | |
667 | U8* s = (U8*)SvPV(sv, len); | |
668 | U8* converted; | |
669 | ||
670 | converted = bytes_to_utf8(s, &len); /* This allocs */ | |
671 | sv_setpvn(sv, (char *)converted, len); | |
672 | SvUTF8_on(sv); /* XXX Should we? */ | |
673 | Safefree(converted); /* ... so free it */ | |
674 | RETVAL = len; | |
675 | } | |
676 | } | |
677 | OUTPUT: | |
678 | RETVAL | |
2c674647 | 679 | |
67e989fb | 680 | I32 |
2c674647 | 681 | _utf8_to_bytes(sv, ...) |
85982a32 JH |
682 | SV * sv |
683 | CODE: | |
684 | { | |
685 | SV * to = items > 1 ? ST(1) : Nullsv; | |
686 | SV * check = items > 2 ? ST(2) : Nullsv; | |
687 | ||
688 | if (to) { | |
689 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); | |
690 | } else { | |
691 | STRLEN len; | |
692 | U8 *s = (U8*)SvPV(sv, len); | |
693 | ||
694 | RETVAL = 0; | |
695 | if (SvTRUE(check)) { | |
696 | /* Must do things the slow way */ | |
697 | U8 *dest; | |
698 | /* We need a copy to pass to check() */ | |
c6a7db43 | 699 | U8 *src = (U8*)savepv((char *)s); |
85982a32 JH |
700 | U8 *send = s + len; |
701 | ||
702 | New(83, dest, len, U8); /* I think */ | |
703 | ||
704 | while (s < send) { | |
705 | if (*s < 0x80){ | |
706 | *dest++ = *s++; | |
707 | } else { | |
708 | STRLEN ulen; | |
709 | UV uv = *s++; | |
710 | ||
711 | /* Have to do it all ourselves because of error routine, | |
712 | aargh. */ | |
713 | if (!(uv & 0x40)){ goto failure; } | |
714 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } | |
715 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } | |
716 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } | |
717 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } | |
718 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } | |
719 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } | |
720 | else { ulen = 13; uv = 0; } | |
87714904 | 721 | |
85982a32 JH |
722 | /* Note change to utf8.c variable naming, for variety */ |
723 | while (ulen--) { | |
c6a7db43 NIS |
724 | if ((*s & 0xc0) != 0x80){ |
725 | goto failure; | |
85982a32 JH |
726 | } else { |
727 | uv = (uv << 6) | (*s++ & 0x3f); | |
728 | } | |
87714904 | 729 | } |
67e989fb JH |
730 | if (uv > 256) { |
731 | failure: | |
85982a32 JH |
732 | call_failure(check, s, dest, src); |
733 | /* Now what happens? */ | |
67e989fb JH |
734 | } |
735 | *dest++ = (U8)uv; | |
85982a32 JH |
736 | } |
737 | } | |
738 | } else { | |
739 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); | |
2c674647 | 740 | } |
85982a32 JH |
741 | } |
742 | } | |
743 | OUTPUT: | |
744 | RETVAL | |
2c674647 | 745 | |
2c674647 | 746 | bool |
b2704119 | 747 | is_utf8(sv, check = 0) |
4411f3b6 | 748 | SV * sv |
b2704119 | 749 | int check |
85982a32 JH |
750 | CODE: |
751 | { | |
752 | if (SvGMAGICAL(sv)) /* it could be $1, for example */ | |
753 | sv = newSVsv(sv); /* GMAGIG will be done */ | |
754 | if (SvPOK(sv)) { | |
755 | RETVAL = SvUTF8(sv) ? TRUE : FALSE; | |
756 | if (RETVAL && | |
757 | check && | |
758 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) | |
2c674647 | 759 | RETVAL = FALSE; |
85982a32 JH |
760 | } else { |
761 | RETVAL = FALSE; | |
762 | } | |
763 | if (sv != ST(0)) | |
764 | SvREFCNT_dec(sv); /* it was a temp copy */ | |
765 | } | |
766 | OUTPUT: | |
767 | RETVAL | |
2c674647 JH |
768 | |
769 | SV * | |
4411f3b6 | 770 | _utf8_on(sv) |
85982a32 JH |
771 | SV * sv |
772 | CODE: | |
773 | { | |
774 | if (SvPOK(sv)) { | |
775 | SV *rsv = newSViv(SvUTF8(sv)); | |
776 | RETVAL = rsv; | |
777 | SvUTF8_on(sv); | |
778 | } else { | |
779 | RETVAL = &PL_sv_undef; | |
780 | } | |
781 | } | |
782 | OUTPUT: | |
783 | RETVAL | |
2c674647 JH |
784 | |
785 | SV * | |
4411f3b6 | 786 | _utf8_off(sv) |
85982a32 JH |
787 | SV * sv |
788 | CODE: | |
789 | { | |
790 | if (SvPOK(sv)) { | |
791 | SV *rsv = newSViv(SvUTF8(sv)); | |
792 | RETVAL = rsv; | |
793 | SvUTF8_off(sv); | |
794 | } else { | |
795 | RETVAL = &PL_sv_undef; | |
796 | } | |
797 | } | |
798 | OUTPUT: | |
799 | RETVAL | |
800 | ||
85982a32 JH |
801 | int |
802 | DIE_ON_ERR() | |
803 | CODE: | |
804 | RETVAL = ENCODE_DIE_ON_ERR; | |
805 | OUTPUT: | |
806 | RETVAL | |
807 | ||
c6a7db43 | 808 | int |
85982a32 JH |
809 | WARN_ON_ERR() |
810 | CODE: | |
811 | RETVAL = ENCODE_WARN_ON_ERR; | |
812 | OUTPUT: | |
813 | RETVAL | |
814 | ||
815 | int | |
816 | LEAVE_SRC() | |
817 | CODE: | |
818 | RETVAL = ENCODE_LEAVE_SRC; | |
819 | OUTPUT: | |
820 | RETVAL | |
821 | ||
822 | int | |
823 | RETURN_ON_ERR() | |
824 | CODE: | |
825 | RETVAL = ENCODE_RETURN_ON_ERR; | |
826 | OUTPUT: | |
827 | RETVAL | |
828 | ||
829 | int | |
830 | PERLQQ() | |
831 | CODE: | |
832 | RETVAL = ENCODE_PERLQQ; | |
833 | OUTPUT: | |
834 | RETVAL | |
835 | ||
836 | int | |
af1f55d9 JH |
837 | HTMLCREF() |
838 | CODE: | |
839 | RETVAL = ENCODE_HTMLCREF; | |
840 | OUTPUT: | |
841 | RETVAL | |
842 | ||
843 | int | |
844 | XMLCREF() | |
845 | CODE: | |
846 | RETVAL = ENCODE_XMLCREF; | |
847 | OUTPUT: | |
848 | RETVAL | |
849 | ||
850 | int | |
56ff7374 RGS |
851 | STOP_AT_PARTIAL() |
852 | CODE: | |
853 | RETVAL = ENCODE_STOP_AT_PARTIAL; | |
854 | OUTPUT: | |
855 | RETVAL | |
856 | ||
857 | int | |
85982a32 JH |
858 | FB_DEFAULT() |
859 | CODE: | |
860 | RETVAL = ENCODE_FB_DEFAULT; | |
861 | OUTPUT: | |
862 | RETVAL | |
863 | ||
864 | int | |
865 | FB_CROAK() | |
866 | CODE: | |
867 | RETVAL = ENCODE_FB_CROAK; | |
868 | OUTPUT: | |
869 | RETVAL | |
870 | ||
871 | int | |
872 | FB_QUIET() | |
873 | CODE: | |
874 | RETVAL = ENCODE_FB_QUIET; | |
875 | OUTPUT: | |
876 | RETVAL | |
877 | ||
878 | int | |
879 | FB_WARN() | |
880 | CODE: | |
881 | RETVAL = ENCODE_FB_WARN; | |
882 | OUTPUT: | |
883 | RETVAL | |
884 | ||
885 | int | |
886 | FB_PERLQQ() | |
887 | CODE: | |
888 | RETVAL = ENCODE_FB_PERLQQ; | |
889 | OUTPUT: | |
890 | RETVAL | |
2c674647 | 891 | |
af1f55d9 JH |
892 | int |
893 | FB_HTMLCREF() | |
894 | CODE: | |
895 | RETVAL = ENCODE_FB_HTMLCREF; | |
896 | OUTPUT: | |
897 | RETVAL | |
898 | ||
899 | int | |
900 | FB_XMLCREF() | |
901 | CODE: | |
902 | RETVAL = ENCODE_FB_XMLCREF; | |
903 | OUTPUT: | |
904 | RETVAL | |
905 | ||
33af2bc7 NIS |
906 | BOOT: |
907 | { | |
85982a32 | 908 | #include "def_t.h" |
e7cbefb8 | 909 | #include "def_t.exh" |
33af2bc7 | 910 | } |