Commit | Line | Data |
---|---|---|
fcf2db38 NIS |
1 | #define PERL_NO_GET_CONTEXT |
2 | ||
2c674647 JH |
3 | #include "EXTERN.h" |
4 | #include "perl.h" | |
5 | #include "XSUB.h" | |
2f2b4ff2 NIS |
6 | #define U8 U8 |
7 | #include "encode.h" | |
8 | #include "iso8859.h" | |
9 | #include "EBCDIC.h" | |
10 | #include "Symbols.h" | |
2c674647 | 11 | |
fcf2db38 NIS |
12 | |
13 | #define UNIMPLEMENTED(x,y) y x (SV *sv, char *encoding) {dTHX; \ | |
2f5768b8 | 14 | Perl_croak(aTHX_ "panic_unimplemented"); \ |
4a83738a | 15 | return (y)0; /* fool picky compilers */ \ |
87714904 | 16 | } |
67e989fb JH |
17 | UNIMPLEMENTED(_encoded_utf8_to_bytes, I32) |
18 | UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) | |
19 | ||
9df9a5cd | 20 | #if defined(USE_PERLIO) && !defined(USE_SFIO) |
72e44f29 NIS |
21 | /* Define an encoding "layer" in the perliol.h sense. |
22 | The layer defined here "inherits" in an object-oriented sense from the | |
23 | "perlio" layer with its PerlIOBuf_* "methods". | |
24 | The implementation is particularly efficient as until Encode settles down | |
25 | there is no point in tryint to tune it. | |
26 | ||
27 | The layer works by overloading the "fill" and "flush" methods. | |
28 | ||
29 | "fill" calls "SUPER::fill" in perl terms, then calls the encode OO perl API | |
30 | to convert the encoded data to UTF-8 form, then copies it back to the | |
31 | buffer. The "base class's" read methods then see the UTF-8 data. | |
32 | ||
33 | "flush" transforms the UTF-8 data deposited by the "base class's write | |
34 | method in the buffer back into the encoded form using the encode OO perl API, | |
35 | then copies data back into the buffer and calls "SUPER::flush. | |
36 | ||
37 | Note that "flush" is _also_ called for read mode - we still do the (back)-translate | |
38 | so that the the base class's "flush" sees the correct number of encoded chars | |
39 | for positioning the seek pointer. (This double translation is the worst performance | |
40 | issue - particularly with all-perl encode engine.) | |
41 | ||
42 | */ | |
43 | ||
44 | ||
33af2bc7 NIS |
45 | #include "perliol.h" |
46 | ||
47 | typedef struct | |
48 | { | |
49 | PerlIOBuf base; /* PerlIOBuf stuff */ | |
50 | SV * bufsv; | |
51 | SV * enc; | |
52 | } PerlIOEncode; | |
53 | ||
e3f3bf95 NIS |
54 | SV * |
55 | PerlIOEncode_getarg(PerlIO *f) | |
56 | { | |
fcf2db38 | 57 | dTHX; |
e3f3bf95 NIS |
58 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); |
59 | SV *sv = &PL_sv_undef; | |
60 | if (e->enc) | |
61 | { | |
62 | dSP; | |
63 | ENTER; | |
64 | SAVETMPS; | |
65 | PUSHMARK(sp); | |
66 | XPUSHs(e->enc); | |
67 | PUTBACK; | |
68 | if (perl_call_method("name",G_SCALAR) == 1) | |
69 | { | |
70 | SPAGAIN; | |
71 | sv = newSVsv(POPs); | |
72 | PUTBACK; | |
73 | } | |
74 | } | |
75 | return sv; | |
76 | } | |
33af2bc7 NIS |
77 | |
78 | IV | |
e3f3bf95 | 79 | PerlIOEncode_pushed(PerlIO *f, const char *mode, SV *arg) |
33af2bc7 NIS |
80 | { |
81 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); | |
82 | dTHX; | |
83 | dSP; | |
84 | IV code; | |
e3f3bf95 | 85 | code = PerlIOBuf_pushed(f,mode,Nullsv); |
33af2bc7 NIS |
86 | ENTER; |
87 | SAVETMPS; | |
88 | PUSHMARK(sp); | |
e3f3bf95 | 89 | XPUSHs(arg); |
33af2bc7 | 90 | PUTBACK; |
51ef4e11 | 91 | if (perl_call_pv("Encode::find_encoding",G_SCALAR) != 1) |
52744f63 NIS |
92 | { |
93 | /* should never happen */ | |
51ef4e11 | 94 | Perl_die(aTHX_ "Encode::find_encoding did not return a value"); |
52744f63 NIS |
95 | return -1; |
96 | } | |
33af2bc7 NIS |
97 | SPAGAIN; |
98 | e->enc = POPs; | |
99 | PUTBACK; | |
100 | if (!SvROK(e->enc)) | |
52744f63 NIS |
101 | { |
102 | e->enc = Nullsv; | |
103 | errno = EINVAL; | |
29b291f7 | 104 | Perl_warner(aTHX_ WARN_IO, "Cannot find encoding \"%"SVf"\"", arg); |
52744f63 NIS |
105 | return -1; |
106 | } | |
33af2bc7 NIS |
107 | SvREFCNT_inc(e->enc); |
108 | FREETMPS; | |
109 | LEAVE; | |
110 | PerlIOBase(f)->flags |= PERLIO_F_UTF8; | |
111 | return code; | |
112 | } | |
113 | ||
114 | IV | |
115 | PerlIOEncode_popped(PerlIO *f) | |
116 | { | |
117 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); | |
118 | dTHX; | |
119 | if (e->enc) | |
120 | { | |
121 | SvREFCNT_dec(e->enc); | |
122 | e->enc = Nullsv; | |
123 | } | |
124 | if (e->bufsv) | |
125 | { | |
126 | SvREFCNT_dec(e->bufsv); | |
127 | e->bufsv = Nullsv; | |
128 | } | |
129 | return 0; | |
130 | } | |
131 | ||
132 | STDCHAR * | |
133 | PerlIOEncode_get_base(PerlIO *f) | |
134 | { | |
135 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); | |
136 | dTHX; | |
137 | if (!e->base.bufsiz) | |
138 | e->base.bufsiz = 1024; | |
139 | if (!e->bufsv) | |
140 | { | |
141 | e->bufsv = newSV(e->base.bufsiz); | |
142 | sv_setpvn(e->bufsv,"",0); | |
143 | } | |
62e8870c | 144 | e->base.buf = (STDCHAR *)SvPVX(e->bufsv); |
33af2bc7 NIS |
145 | if (!e->base.ptr) |
146 | e->base.ptr = e->base.buf; | |
147 | if (!e->base.end) | |
148 | e->base.end = e->base.buf; | |
149 | if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv)) | |
150 | { | |
151 | Perl_warn(aTHX_ " ptr %p(%p)%p", | |
152 | e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv)); | |
153 | abort(); | |
154 | } | |
155 | if (SvLEN(e->bufsv) < e->base.bufsiz) | |
156 | { | |
157 | SSize_t poff = e->base.ptr - e->base.buf; | |
158 | SSize_t eoff = e->base.end - e->base.buf; | |
62e8870c | 159 | e->base.buf = (STDCHAR *)SvGROW(e->bufsv,e->base.bufsiz); |
33af2bc7 NIS |
160 | e->base.ptr = e->base.buf + poff; |
161 | e->base.end = e->base.buf + eoff; | |
162 | } | |
163 | if (e->base.ptr < e->base.buf || e->base.ptr > e->base.buf+SvLEN(e->bufsv)) | |
164 | { | |
165 | Perl_warn(aTHX_ " ptr %p(%p)%p", | |
166 | e->base.buf,e->base.ptr,e->base.buf+SvLEN(e->bufsv)); | |
167 | abort(); | |
168 | } | |
169 | return e->base.buf; | |
170 | } | |
171 | ||
33af2bc7 NIS |
172 | IV |
173 | PerlIOEncode_fill(PerlIO *f) | |
174 | { | |
175 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); | |
176 | dTHX; | |
177 | dSP; | |
178 | IV code; | |
33af2bc7 NIS |
179 | code = PerlIOBuf_fill(f); |
180 | if (code == 0) | |
181 | { | |
182 | SV *uni; | |
72e44f29 NIS |
183 | STRLEN len; |
184 | char *s; | |
185 | /* Set SV that is the buffer to be buf..ptr */ | |
33af2bc7 NIS |
186 | SvCUR_set(e->bufsv, e->base.end - e->base.buf); |
187 | SvUTF8_off(e->bufsv); | |
188 | ENTER; | |
189 | SAVETMPS; | |
190 | PUSHMARK(sp); | |
191 | XPUSHs(e->enc); | |
192 | XPUSHs(e->bufsv); | |
193 | XPUSHs(&PL_sv_yes); | |
194 | PUTBACK; | |
50d26985 | 195 | if (perl_call_method("decode",G_SCALAR) != 1) |
33af2bc7 NIS |
196 | code = -1; |
197 | SPAGAIN; | |
198 | uni = POPs; | |
199 | PUTBACK; | |
72e44f29 NIS |
200 | /* Now get translated string (forced to UTF-8) and copy back to buffer |
201 | don't use sv_setsv as that may "steal" PV from returned temp | |
202 | and so free() our known-large-enough buffer. | |
203 | sv_setpvn() should do but let us do it long hand. | |
204 | */ | |
205 | s = SvPVutf8(uni,len); | |
206 | if (s != SvPVX(e->bufsv)) | |
207 | { | |
62e8870c | 208 | e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len); |
72e44f29 NIS |
209 | Move(s,e->base.buf,len,char); |
210 | SvCUR_set(e->bufsv,len); | |
211 | } | |
212 | SvUTF8_on(e->bufsv); | |
213 | e->base.end = e->base.buf+len; | |
33af2bc7 NIS |
214 | e->base.ptr = e->base.buf; |
215 | FREETMPS; | |
216 | LEAVE; | |
217 | } | |
218 | return code; | |
219 | } | |
220 | ||
221 | IV | |
222 | PerlIOEncode_flush(PerlIO *f) | |
223 | { | |
224 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); | |
225 | IV code = 0; | |
8040349a NIS |
226 | if (e->bufsv && (PerlIOBase(f)->flags & (PERLIO_F_RDBUF|PERLIO_F_WRBUF)) |
227 | &&(e->base.ptr > e->base.buf) | |
228 | ) | |
33af2bc7 | 229 | { |
8040349a | 230 | dTHX; |
33af2bc7 NIS |
231 | dSP; |
232 | SV *str; | |
233 | char *s; | |
234 | STRLEN len; | |
72e44f29 NIS |
235 | SSize_t left = 0; |
236 | if (PerlIOBase(f)->flags & PERLIO_F_RDBUF) | |
237 | { | |
238 | /* This is really just a flag to see if we took all the data, if | |
239 | we did PerlIOBase_flush avoids a seek to lower layer. | |
240 | Need to revisit if we start getting clever with unreads or seeks-in-buffer | |
241 | */ | |
242 | left = e->base.end - e->base.ptr; | |
243 | } | |
33af2bc7 NIS |
244 | ENTER; |
245 | SAVETMPS; | |
246 | PUSHMARK(sp); | |
247 | XPUSHs(e->enc); | |
72e44f29 | 248 | SvCUR_set(e->bufsv, e->base.ptr - e->base.buf); |
33af2bc7 NIS |
249 | SvUTF8_on(e->bufsv); |
250 | XPUSHs(e->bufsv); | |
251 | XPUSHs(&PL_sv_yes); | |
252 | PUTBACK; | |
50d26985 | 253 | if (perl_call_method("encode",G_SCALAR) != 1) |
33af2bc7 NIS |
254 | code = -1; |
255 | SPAGAIN; | |
256 | str = POPs; | |
257 | PUTBACK; | |
72e44f29 NIS |
258 | s = SvPV(str,len); |
259 | if (s != SvPVX(e->bufsv)) | |
260 | { | |
62e8870c | 261 | e->base.buf = (STDCHAR *)SvGROW(e->bufsv,len); |
72e44f29 NIS |
262 | Move(s,e->base.buf,len,char); |
263 | SvCUR_set(e->bufsv,len); | |
264 | } | |
33af2bc7 | 265 | SvUTF8_off(e->bufsv); |
72e44f29 NIS |
266 | e->base.ptr = e->base.buf+len; |
267 | /* restore end != ptr as inequality is used by PerlIOBuf_flush in read case */ | |
268 | e->base.end = e->base.ptr + left; | |
33af2bc7 NIS |
269 | FREETMPS; |
270 | LEAVE; | |
271 | if (PerlIOBuf_flush(f) != 0) | |
272 | code = -1; | |
273 | } | |
274 | return code; | |
275 | } | |
276 | ||
277 | IV | |
278 | PerlIOEncode_close(PerlIO *f) | |
279 | { | |
280 | PerlIOEncode *e = PerlIOSelf(f,PerlIOEncode); | |
281 | IV code = PerlIOBase_close(f); | |
282 | dTHX; | |
283 | if (e->bufsv) | |
284 | { | |
285 | SvREFCNT_dec(e->bufsv); | |
286 | e->bufsv = Nullsv; | |
287 | } | |
288 | e->base.buf = NULL; | |
289 | e->base.ptr = NULL; | |
290 | e->base.end = NULL; | |
291 | PerlIOBase(f)->flags &= ~(PERLIO_F_RDBUF|PERLIO_F_WRBUF); | |
292 | return code; | |
293 | } | |
294 | ||
72e44f29 NIS |
295 | Off_t |
296 | PerlIOEncode_tell(PerlIO *f) | |
297 | { | |
a999f61b | 298 | dTHX; |
72e44f29 NIS |
299 | PerlIOBuf *b = PerlIOSelf(f,PerlIOBuf); |
300 | /* Unfortunately the only way to get a postion is to back-translate, | |
301 | the UTF8-bytes we have buf..ptr and adjust accordingly. | |
302 | But we will try and save any unread data in case stream | |
303 | is un-seekable. | |
304 | */ | |
305 | if ((PerlIOBase(f)->flags & PERLIO_F_RDBUF) && b->ptr < b->end) | |
306 | { | |
307 | Size_t count = b->end - b->ptr; | |
e3f3bf95 | 308 | PerlIO_push(aTHX_ f,&PerlIO_pending,"r",Nullsv); |
72e44f29 NIS |
309 | /* Save what we have left to read */ |
310 | PerlIOSelf(f,PerlIOBuf)->bufsiz = count; | |
311 | PerlIO_unread(f,b->ptr,count); | |
312 | /* There isn't any unread data - we just saved it - so avoid the lower seek */ | |
313 | b->end = b->ptr; | |
314 | /* Flush ourselves - now one layer down, | |
315 | this does the back translate and adjusts position | |
316 | */ | |
317 | PerlIO_flush(PerlIONext(f)); | |
318 | /* Set position of the saved data */ | |
319 | PerlIOSelf(f,PerlIOBuf)->posn = b->posn; | |
320 | } | |
321 | else | |
322 | { | |
323 | PerlIO_flush(f); | |
324 | } | |
325 | return b->posn; | |
326 | } | |
327 | ||
33af2bc7 | 328 | PerlIO_funcs PerlIO_encode = { |
72e44f29 | 329 | "encoding", |
33af2bc7 NIS |
330 | sizeof(PerlIOEncode), |
331 | PERLIO_K_BUFFERED, | |
33af2bc7 NIS |
332 | PerlIOEncode_pushed, |
333 | PerlIOEncode_popped, | |
e3f3bf95 NIS |
334 | PerlIOBuf_open, |
335 | PerlIOEncode_getarg, | |
336 | PerlIOBase_fileno, | |
33af2bc7 NIS |
337 | PerlIOBuf_read, |
338 | PerlIOBuf_unread, | |
339 | PerlIOBuf_write, | |
340 | PerlIOBuf_seek, | |
72e44f29 | 341 | PerlIOEncode_tell, |
33af2bc7 NIS |
342 | PerlIOEncode_close, |
343 | PerlIOEncode_flush, | |
344 | PerlIOEncode_fill, | |
345 | PerlIOBase_eof, | |
346 | PerlIOBase_error, | |
347 | PerlIOBase_clearerr, | |
f6c77cf1 | 348 | PerlIOBase_setlinebuf, |
33af2bc7 NIS |
349 | PerlIOEncode_get_base, |
350 | PerlIOBuf_bufsiz, | |
351 | PerlIOBuf_get_ptr, | |
352 | PerlIOBuf_get_cnt, | |
353 | PerlIOBuf_set_ptrcnt, | |
354 | }; | |
9df9a5cd | 355 | #endif /* encode layer */ |
33af2bc7 | 356 | |
2f2b4ff2 NIS |
357 | void |
358 | Encode_Define(pTHX_ encode_t *enc) | |
359 | { | |
51ef4e11 | 360 | dSP; |
2f2b4ff2 NIS |
361 | HV *stash = gv_stashpv("Encode::XS", TRUE); |
362 | SV *sv = sv_bless(newRV_noinc(newSViv(PTR2IV(enc))),stash); | |
dcda1f94 | 363 | int i = 0; |
51ef4e11 NIS |
364 | PUSHMARK(sp); |
365 | XPUSHs(sv); | |
dcda1f94 NIS |
366 | while (enc->name[i]) |
367 | { | |
368 | const char *name = enc->name[i++]; | |
51ef4e11 | 369 | XPUSHs(sv_2mortal(newSVpvn(name,strlen(name)))); |
dcda1f94 | 370 | } |
51ef4e11 NIS |
371 | PUTBACK; |
372 | call_pv("Encode::define_encoding",G_DISCARD); | |
dcda1f94 | 373 | SvREFCNT_dec(sv); |
2f2b4ff2 NIS |
374 | } |
375 | ||
183a2d84 | 376 | void call_failure (SV *routine, U8* done, U8* dest, U8* orig) {} |
67e989fb | 377 | |
2f2b4ff2 NIS |
378 | static SV * |
379 | encode_method(pTHX_ encode_t *enc, encpage_t *dir, SV *src, int check) | |
380 | { | |
381 | STRLEN slen; | |
382 | U8 *s = (U8 *) SvPV(src,slen); | |
383 | SV *dst = sv_2mortal(newSV(2*slen+1)); | |
384 | if (slen) | |
385 | { | |
386 | U8 *d = (U8 *) SvGROW(dst, 2*slen+1); | |
387 | STRLEN dlen = SvLEN(dst); | |
388 | int code; | |
9b37254d | 389 | while ((code = do_encode(dir,s,&slen,d,dlen,&dlen,!check))) |
2f2b4ff2 NIS |
390 | { |
391 | SvCUR_set(dst,dlen); | |
392 | SvPOK_on(dst); | |
9b37254d NIS |
393 | |
394 | if (code == ENCODE_FALLBACK) | |
395 | break; | |
396 | ||
2f2b4ff2 NIS |
397 | switch(code) |
398 | { | |
399 | case ENCODE_NOSPACE: | |
400 | { | |
401 | STRLEN need = (slen) ? (SvLEN(dst)*SvCUR(src)/slen) : (dlen + UTF8_MAXLEN); | |
402 | if (need <= SvLEN(dst)) | |
403 | need += UTF8_MAXLEN; | |
404 | d = (U8 *) SvGROW(dst, need); | |
405 | dlen = SvLEN(dst); | |
406 | slen = SvCUR(src); | |
407 | break; | |
408 | } | |
409 | ||
410 | case ENCODE_NOREP: | |
411 | if (dir == enc->f_utf8) | |
412 | { | |
413 | if (!check && ckWARN_d(WARN_UTF8)) | |
414 | { | |
415 | STRLEN clen; | |
9041c2e3 NIS |
416 | UV ch = utf8n_to_uvuni(s+slen,(SvCUR(src)-slen),&clen,0); |
417 | Perl_warner(aTHX_ WARN_UTF8, "\"\\N{U+%"UVxf"}\" does not map to %s", ch, enc->name[0]); | |
2f2b4ff2 NIS |
418 | /* FIXME: Skip over the character, copy in replacement and continue |
419 | * but that is messy so for now just fail. | |
420 | */ | |
421 | return &PL_sv_undef; | |
422 | } | |
423 | else | |
424 | { | |
425 | return &PL_sv_undef; | |
426 | } | |
427 | } | |
428 | else | |
429 | { | |
430 | /* UTF-8 is supposed to be "Universal" so should not happen */ | |
431 | Perl_croak(aTHX_ "%s '%.*s' does not map to UTF-8", | |
dcda1f94 | 432 | enc->name[0], (int)(SvCUR(src)-slen),s+slen); |
2f2b4ff2 NIS |
433 | } |
434 | break; | |
435 | ||
436 | case ENCODE_PARTIAL: | |
437 | if (!check && ckWARN_d(WARN_UTF8)) | |
438 | { | |
439 | Perl_warner(aTHX_ WARN_UTF8, "Partial %s character", | |
dcda1f94 | 440 | (dir == enc->f_utf8) ? "UTF-8" : enc->name[0]); |
2f2b4ff2 NIS |
441 | } |
442 | return &PL_sv_undef; | |
443 | ||
444 | default: | |
445 | Perl_croak(aTHX_ "Unexpected code %d converting %s %s", | |
dcda1f94 | 446 | code, (dir == enc->f_utf8) ? "to" : "from",enc->name[0]); |
2f2b4ff2 NIS |
447 | return &PL_sv_undef; |
448 | } | |
449 | } | |
450 | SvCUR_set(dst,dlen); | |
451 | SvPOK_on(dst); | |
452 | if (check) | |
453 | { | |
454 | if (slen < SvCUR(src)) | |
455 | { | |
456 | Move(s+slen,s,SvCUR(src)-slen,U8); | |
457 | } | |
458 | SvCUR_set(src,SvCUR(src)-slen); | |
459 | } | |
460 | } | |
8040349a NIS |
461 | else |
462 | { | |
463 | SvCUR_set(dst,slen); | |
464 | SvPOK_on(dst); | |
465 | } | |
2f2b4ff2 NIS |
466 | return dst; |
467 | } | |
468 | ||
50d26985 | 469 | MODULE = Encode PACKAGE = Encode::XS PREFIX = Method_ |
2f2b4ff2 NIS |
470 | |
471 | PROTOTYPES: ENABLE | |
472 | ||
473 | void | |
691638dd | 474 | Method_decode(obj,src,check = FALSE) |
2f2b4ff2 NIS |
475 | SV * obj |
476 | SV * src | |
691638dd | 477 | bool check |
2f2b4ff2 NIS |
478 | CODE: |
479 | { | |
480 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); | |
481 | ST(0) = encode_method(aTHX_ enc, enc->t_utf8, src, check); | |
482 | SvUTF8_on(ST(0)); | |
483 | XSRETURN(1); | |
484 | } | |
485 | ||
486 | void | |
691638dd | 487 | Method_encode(obj,src,check = FALSE) |
2f2b4ff2 NIS |
488 | SV * obj |
489 | SV * src | |
691638dd | 490 | bool check |
2f2b4ff2 NIS |
491 | CODE: |
492 | { | |
493 | encode_t *enc = INT2PTR(encode_t *, SvIV(SvRV(obj))); | |
494 | sv_utf8_upgrade(src); | |
495 | ST(0) = encode_method(aTHX_ enc, enc->f_utf8, src, check); | |
496 | XSRETURN(1); | |
497 | } | |
498 | ||
67e989fb | 499 | MODULE = Encode PACKAGE = Encode |
2c674647 JH |
500 | |
501 | PROTOTYPES: ENABLE | |
502 | ||
67e989fb | 503 | I32 |
2c674647 | 504 | _bytes_to_utf8(sv, ...) |
67e989fb | 505 | SV * sv |
2c674647 | 506 | CODE: |
67e989fb JH |
507 | { |
508 | SV * encoding = items == 2 ? ST(1) : Nullsv; | |
509 | ||
510 | if (encoding) | |
511 | RETVAL = _encoded_bytes_to_utf8(sv, SvPV_nolen(encoding)); | |
512 | else { | |
513 | STRLEN len; | |
183a2d84 | 514 | U8* s = (U8*)SvPV(sv, len); |
67e989fb JH |
515 | U8* converted; |
516 | ||
517 | converted = bytes_to_utf8(s, &len); /* This allocs */ | |
183a2d84 | 518 | sv_setpvn(sv, (char *)converted, len); |
67e989fb JH |
519 | SvUTF8_on(sv); /* XXX Should we? */ |
520 | Safefree(converted); /* ... so free it */ | |
521 | RETVAL = len; | |
522 | } | |
523 | } | |
2c674647 | 524 | OUTPUT: |
67e989fb | 525 | RETVAL |
2c674647 | 526 | |
67e989fb | 527 | I32 |
2c674647 | 528 | _utf8_to_bytes(sv, ...) |
67e989fb | 529 | SV * sv |
2c674647 | 530 | CODE: |
67e989fb JH |
531 | { |
532 | SV * to = items > 1 ? ST(1) : Nullsv; | |
533 | SV * check = items > 2 ? ST(2) : Nullsv; | |
87714904 | 534 | |
67e989fb JH |
535 | if (to) |
536 | RETVAL = _encoded_utf8_to_bytes(sv, SvPV_nolen(to)); | |
537 | else { | |
67e989fb | 538 | STRLEN len; |
b113ac0e | 539 | U8 *s = (U8*)SvPV(sv, len); |
67e989fb | 540 | |
9c5ffd7c | 541 | RETVAL = 0; |
67e989fb JH |
542 | if (SvTRUE(check)) { |
543 | /* Must do things the slow way */ | |
544 | U8 *dest; | |
87714904 | 545 | U8 *src = (U8*)savepv((char *)s); /* We need a copy to pass to check() */ |
67e989fb JH |
546 | U8 *send = s + len; |
547 | ||
548 | New(83, dest, len, U8); /* I think */ | |
549 | ||
550 | while (s < send) { | |
551 | if (*s < 0x80) | |
552 | *dest++ = *s++; | |
553 | else { | |
b113ac0e JH |
554 | STRLEN ulen; |
555 | UV uv = *s++; | |
87714904 | 556 | |
67e989fb JH |
557 | /* Have to do it all ourselves because of error routine, |
558 | aargh. */ | |
559 | if (!(uv & 0x40)) | |
560 | goto failure; | |
561 | if (!(uv & 0x20)) { ulen = 2; uv &= 0x1f; } | |
562 | else if (!(uv & 0x10)) { ulen = 3; uv &= 0x0f; } | |
563 | else if (!(uv & 0x08)) { ulen = 4; uv &= 0x07; } | |
564 | else if (!(uv & 0x04)) { ulen = 5; uv &= 0x03; } | |
565 | else if (!(uv & 0x02)) { ulen = 6; uv &= 0x01; } | |
566 | else if (!(uv & 0x01)) { ulen = 7; uv = 0; } | |
567 | else { ulen = 13; uv = 0; } | |
87714904 | 568 | |
67e989fb JH |
569 | /* Note change to utf8.c variable naming, for variety */ |
570 | while (ulen--) { | |
571 | if ((*s & 0xc0) != 0x80) | |
572 | goto failure; | |
87714904 | 573 | |
67e989fb JH |
574 | else |
575 | uv = (uv << 6) | (*s++ & 0x3f); | |
87714904 | 576 | } |
67e989fb JH |
577 | if (uv > 256) { |
578 | failure: | |
579 | call_failure(check, s, dest, src); | |
580 | /* Now what happens? */ | |
581 | } | |
582 | *dest++ = (U8)uv; | |
583 | } | |
584 | } | |
585 | } else | |
586 | RETVAL = (utf8_to_bytes(s, &len) ? len : 0); | |
587 | } | |
2c674647 JH |
588 | } |
589 | OUTPUT: | |
590 | RETVAL | |
591 | ||
2c674647 | 592 | bool |
4411f3b6 NIS |
593 | is_utf8(sv, check = FALSE) |
594 | SV * sv | |
595 | bool check | |
2c674647 JH |
596 | CODE: |
597 | { | |
2c674647 | 598 | if (SvPOK(sv)) { |
4411f3b6 | 599 | RETVAL = SvUTF8(sv) ? TRUE : FALSE; |
2c674647 | 600 | if (RETVAL && |
4411f3b6 | 601 | check && |
2c674647 JH |
602 | !is_utf8_string((U8*)SvPVX(sv), SvCUR(sv))) |
603 | RETVAL = FALSE; | |
604 | } else { | |
605 | RETVAL = FALSE; | |
606 | } | |
607 | } | |
608 | OUTPUT: | |
609 | RETVAL | |
610 | ||
611 | SV * | |
4411f3b6 | 612 | _utf8_on(sv) |
2c674647 JH |
613 | SV * sv |
614 | CODE: | |
615 | { | |
616 | if (SvPOK(sv)) { | |
87714904 | 617 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 JH |
618 | RETVAL = rsv; |
619 | SvUTF8_on(sv); | |
620 | } else { | |
621 | RETVAL = &PL_sv_undef; | |
622 | } | |
623 | } | |
624 | OUTPUT: | |
625 | RETVAL | |
626 | ||
627 | SV * | |
4411f3b6 | 628 | _utf8_off(sv) |
2c674647 JH |
629 | SV * sv |
630 | CODE: | |
631 | { | |
632 | if (SvPOK(sv)) { | |
87714904 | 633 | SV *rsv = newSViv(SvUTF8(sv)); |
2c674647 JH |
634 | RETVAL = rsv; |
635 | SvUTF8_off(sv); | |
636 | } else { | |
637 | RETVAL = &PL_sv_undef; | |
638 | } | |
639 | } | |
640 | OUTPUT: | |
641 | RETVAL | |
642 | ||
33af2bc7 NIS |
643 | BOOT: |
644 | { | |
6a59c517 | 645 | #if defined(USE_PERLIO) && !defined(USE_SFIO) |
a999f61b | 646 | PerlIO_define_layer(aTHX_ &PerlIO_encode); |
33af2bc7 | 647 | #endif |
2f2b4ff2 NIS |
648 | #include "iso8859.def" |
649 | #include "EBCDIC.def" | |
650 | #include "Symbols.def" | |
33af2bc7 | 651 | } |