Commit | Line | Data |
---|---|---|
c6b7cc21 SH |
1 | |
2 | #define PERL_NO_GET_CONTEXT /* we want efficiency */ | |
3 | ||
4 | /* private functions which need pTHX_ and aTHX_ | |
5 | pv_cat_decompHangul | |
6 | sv_2pvunicode | |
7 | pv_utf8_decompose | |
8 | pv_utf8_reorder | |
9 | pv_utf8_compose | |
10 | */ | |
11 | ||
12 | #include "EXTERN.h" | |
13 | #include "perl.h" | |
14 | #include "XSUB.h" | |
15 | ||
16 | /* These 5 files are prepared by mkheader */ | |
17 | #include "unfcmb.h" | |
18 | #include "unfcan.h" | |
19 | #include "unfcpt.h" | |
20 | #include "unfcmp.h" | |
21 | #include "unfexc.h" | |
22 | ||
23 | /* The generated normalization tables since v5.20 are in native character set | |
24 | * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for | |
25 | * later perls, and redefine that to be 'uvuni' for earlier ones */ | |
26 | #if PERL_VERSION < 20 | |
27 | # undef uvchr_to_utf8 | |
28 | # ifdef uvuni_to_utf8 | |
29 | # define uvchr_to_utf8 uvuni_to_utf8 | |
30 | # else /* Perl 5.6.1 */ | |
31 | # define uvchr_to_utf8 uv_to_utf8 | |
32 | # endif | |
33 | ||
34 | # undef utf8n_to_uvchr | |
35 | # ifdef utf8n_to_uvuni | |
36 | # define utf8n_to_uvchr utf8n_to_uvuni | |
37 | # else /* Perl 5.6.1 */ | |
38 | # define utf8n_to_uvchr utf8_to_uv | |
39 | # endif | |
40 | #endif | |
41 | ||
42 | /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ | |
43 | #ifndef UTF8_ALLOW_BOM | |
44 | #define UTF8_ALLOW_BOM (0) | |
45 | #endif /* UTF8_ALLOW_BOM */ | |
46 | ||
47 | #ifndef UTF8_ALLOW_SURROGATE | |
48 | #define UTF8_ALLOW_SURROGATE (0) | |
49 | #endif /* UTF8_ALLOW_SURROGATE */ | |
50 | ||
51 | #ifndef UTF8_ALLOW_FE_FF | |
52 | #define UTF8_ALLOW_FE_FF (0) | |
53 | #endif /* UTF8_ALLOW_FE_FF */ | |
54 | ||
55 | #ifndef UTF8_ALLOW_FFFF | |
56 | #define UTF8_ALLOW_FFFF (0) | |
57 | #endif /* UTF8_ALLOW_FFFF */ | |
58 | ||
1ef95abd SH |
59 | #ifndef PERL_UNUSED_VAR |
60 | # define PERL_UNUSED_VAR(x) ((void)sizeof(x)) | |
61 | #endif | |
62 | ||
c6b7cc21 SH |
63 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) |
64 | ||
/* Make sure at least `need` more bytes can be written at d before calling
 * uvchr_to_utf8() (or copying bytes).  The caller must have these in scope:
 *   dstart - start of the output buffer (may be reallocated here),
 *   d      - current write position     (re-based after reallocation),
 *   dlen   - usable size of the buffer; one extra byte is always allocated.
 * The expansion is a single brace-enclosed block: its temporaries do not leak
 * into the caller's scope, `need` is evaluated once, and the macro may be
 * used several times (or after other statements) within one block.  Call
 * sites written without a trailing semicolon keep working unchanged. */
#define Renew_d_if_not_enough_to(need)	{				\
	    const STRLEN rd_used_ = (STRLEN)(d - dstart);		\
	    const STRLEN rd_need_ = (STRLEN)(need);			\
	    if (dlen < rd_used_ + rd_need_) {				\
		dlen += rd_need_;					\
		Renew(dstart, dlen+1, U8);				\
		d = dstart + rd_used_;					\
	    }								\
	}
73 | ||
74 | /* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */ | |
75 | #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" | |
76 | ||
77 | /* utf8_hop() hops back before start. Maybe broken UTF-8 */ | |
78 | #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" | |
79 | ||
80 | /* Code points above 0x10ffff are never decomposed, reordered or composed: the table lookups below ignore them without complaint. */ | |
81 | #define VALID_UTF_MAX (0x10ffff) | |
82 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) | |
83 | ||
84 | /* size of array for combining characters */ | |
85 | /* enough as an initial value? */ | |
86 | #define CC_SEQ_SIZE (10) | |
87 | #define CC_SEQ_STEP (5) | |
88 | ||
/* HANGUL begin */
/* Parameters of the arithmetic Hangul syllable (de)composition.
 * Each *Final value is the last code point of its range, Base + Count - 1.
 * Hangul_TBase is one below the first trailing consonant, so a T index of 0
 * means "no trailing consonant". */
#define Hangul_LBase	0x1100
#define Hangul_LCount	19
#define Hangul_LFinal	(Hangul_LBase + Hangul_LCount - 1)	/* 0x1112 */

#define Hangul_VBase	0x1161
#define Hangul_VCount	21
#define Hangul_VFinal	(Hangul_VBase + Hangul_VCount - 1)	/* 0x1175 */

#define Hangul_TBase	0x11A7
#define Hangul_TCount	28
#define Hangul_TFinal	(Hangul_TBase + Hangul_TCount - 1)	/* 0x11C2 */

#define Hangul_NCount	(Hangul_VCount * Hangul_TCount)		/* 588 */

#define Hangul_SBase	0xAC00
#define Hangul_SCount	(Hangul_LCount * Hangul_NCount)		/* 11172 */
#define Hangul_SFinal	(Hangul_SBase + Hangul_SCount - 1)	/* 0xD7A3 */

/* range predicates; Hangul_IsN is only meaningful for a syllable */
#define Hangul_IsS(u)	(((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
#define Hangul_IsN(u)	((((u) - Hangul_SBase) % Hangul_TCount) == 0)
#define Hangul_IsLV(u)	(Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)	(((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)	(((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
#define Hangul_IsT(u)	(((u) > Hangul_TBase) && ((u) <= Hangul_TFinal))
/* HANGUL end */
115 | ||
116 | /* this is used for canonical ordering of combining characters (c.c.). */ | |
117 | typedef struct { | |
118 | U8 cc; /* combining class */ | |
119 | UV uv; /* codepoint */ | |
120 | STRLEN pos; /* position */ | |
121 | } UNF_cc; | |
122 | ||
123 | static int compare_cc(const void *a, const void *b) | |
124 | { | |
125 | int ret_cc; | |
126 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; | |
127 | if (ret_cc) | |
128 | return ret_cc; | |
129 | ||
130 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) | |
131 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); | |
132 | } | |
133 | ||
134 | static U8* dec_canonical(UV uv) | |
135 | { | |
136 | U8 ***plane, **row; | |
137 | if (OVER_UTF_MAX(uv)) | |
138 | return NULL; | |
139 | plane = (U8***)UNF_canon[uv >> 16]; | |
140 | if (! plane) | |
141 | return NULL; | |
142 | row = plane[(uv >> 8) & 0xff]; | |
143 | return row ? row[uv & 0xff] : NULL; | |
144 | } | |
145 | ||
146 | static U8* dec_compat(UV uv) | |
147 | { | |
148 | U8 ***plane, **row; | |
149 | if (OVER_UTF_MAX(uv)) | |
150 | return NULL; | |
151 | plane = (U8***)UNF_compat[uv >> 16]; | |
152 | if (! plane) | |
153 | return NULL; | |
154 | row = plane[(uv >> 8) & 0xff]; | |
155 | return row ? row[uv & 0xff] : NULL; | |
156 | } | |
157 | ||
158 | static UV composite_uv(UV uv, UV uv2) | |
159 | { | |
160 | UNF_complist ***plane, **row, *cell, *i; | |
161 | ||
162 | if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) | |
163 | return 0; | |
164 | ||
165 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { | |
166 | UV lindex = uv - Hangul_LBase; | |
167 | UV vindex = uv2 - Hangul_VBase; | |
168 | return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * | |
169 | Hangul_TCount); | |
170 | } | |
171 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { | |
172 | UV tindex = uv2 - Hangul_TBase; | |
173 | return(uv + tindex); | |
174 | } | |
175 | plane = UNF_compos[uv >> 16]; | |
176 | if (! plane) | |
177 | return 0; | |
178 | row = plane[(uv >> 8) & 0xff]; | |
179 | if (! row) | |
180 | return 0; | |
181 | cell = row[uv & 0xff]; | |
182 | if (! cell) | |
183 | return 0; | |
184 | for (i = cell; i->nextchar; i++) { | |
185 | if (uv2 == i->nextchar) | |
186 | return i->composite; | |
187 | } | |
188 | return 0; | |
189 | } | |
190 | ||
191 | static U8 getCombinClass(UV uv) | |
192 | { | |
193 | U8 **plane, *row; | |
194 | if (OVER_UTF_MAX(uv)) | |
195 | return 0; | |
196 | plane = (U8**)UNF_combin[uv >> 16]; | |
197 | if (! plane) | |
198 | return 0; | |
199 | row = plane[(uv >> 8) & 0xff]; | |
200 | return row ? row[uv & 0xff] : 0; | |
201 | } | |
202 | ||
203 | static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv) | |
204 | { | |
205 | UV sindex = uv - Hangul_SBase; | |
206 | UV lindex = sindex / Hangul_NCount; | |
207 | UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; | |
208 | UV tindex = sindex % Hangul_TCount; | |
209 | ||
210 | if (! Hangul_IsS(uv)) | |
211 | return d; | |
212 | ||
213 | d = uvchr_to_utf8(d, (lindex + Hangul_LBase)); | |
214 | d = uvchr_to_utf8(d, (vindex + Hangul_VBase)); | |
215 | if (tindex) | |
216 | d = uvchr_to_utf8(d, (tindex + Hangul_TBase)); | |
217 | return d; | |
218 | } | |
219 | ||
220 | static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp) | |
221 | { | |
222 | char *s; | |
223 | STRLEN len; | |
224 | s = SvPV(sv,len); | |
225 | if (!SvUTF8(sv)) { | |
226 | SV* tmpsv = sv_2mortal(newSVpvn(s, len)); | |
227 | if (!SvPOK(tmpsv)) | |
228 | s = SvPV_force(tmpsv,len); | |
229 | sv_utf8_upgrade(tmpsv); | |
230 | s = SvPV(tmpsv,len); | |
231 | } | |
232 | if (lp) | |
233 | *lp = len; | |
234 | return s; | |
235 | } | |
236 | ||
237 | static | |
238 | U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) | |
239 | { | |
240 | U8* p = s; | |
241 | U8* e = s + slen; | |
242 | U8* dstart = *dp; | |
243 | U8* d = dstart; | |
244 | ||
245 | while (p < e) { | |
246 | STRLEN retlen; | |
247 | UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); | |
248 | if (!retlen) | |
249 | croak(ErrRetlenIsZero, "decompose"); | |
250 | p += retlen; | |
251 | ||
252 | if (Hangul_IsS(uv)) { | |
253 | Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) | |
254 | d = pv_cat_decompHangul(aTHX_ d, uv); | |
255 | } | |
256 | else { | |
257 | U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); | |
258 | ||
259 | if (r) { | |
260 | STRLEN len = (STRLEN)strlen((char *)r); | |
261 | Renew_d_if_not_enough_to(len) | |
262 | while (len--) | |
263 | *d++ = *r++; | |
264 | } | |
265 | else { | |
266 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
267 | d = uvchr_to_utf8(d, uv); | |
268 | } | |
269 | } | |
270 | } | |
271 | *dp = dstart; | |
272 | return d; | |
273 | } | |
274 | ||
275 | static | |
276 | U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) | |
277 | { | |
278 | U8* p = s; | |
279 | U8* e = s + slen; | |
280 | U8* dstart = *dp; | |
281 | U8* d = dstart; | |
282 | ||
283 | UNF_cc seq_ary[CC_SEQ_SIZE]; | |
284 | UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ | |
285 | UNF_cc* seq_ext = NULL; /* extend if need */ | |
286 | STRLEN seq_max = CC_SEQ_SIZE; | |
287 | STRLEN cc_pos = 0; | |
288 | ||
289 | while (p < e) { | |
290 | U8 curCC; | |
291 | STRLEN retlen; | |
292 | UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); | |
293 | if (!retlen) | |
294 | croak(ErrRetlenIsZero, "reorder"); | |
295 | p += retlen; | |
296 | ||
297 | curCC = getCombinClass(uv); | |
298 | ||
299 | if (curCC != 0) { | |
300 | if (seq_max < cc_pos + 1) { /* extend if need */ | |
301 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ | |
302 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ | |
303 | STRLEN i; | |
304 | New(0, seq_ext, seq_max, UNF_cc); | |
305 | for (i = 0; i < cc_pos; i++) | |
306 | seq_ext[i] = seq_ary[i]; | |
307 | } | |
308 | else { | |
309 | Renew(seq_ext, seq_max, UNF_cc); | |
310 | } | |
311 | seq_ptr = seq_ext; /* use seq_ext from now */ | |
312 | } | |
313 | ||
314 | seq_ptr[cc_pos].cc = curCC; | |
315 | seq_ptr[cc_pos].uv = uv; | |
316 | seq_ptr[cc_pos].pos = cc_pos; | |
317 | ++cc_pos; | |
318 | ||
319 | if (p < e) | |
320 | continue; | |
321 | } | |
322 | ||
323 | /* output */ | |
324 | if (cc_pos) { | |
325 | STRLEN i; | |
326 | ||
327 | if (cc_pos > 1) /* reordered if there are two c.c.'s */ | |
328 | qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); | |
329 | ||
330 | for (i = 0; i < cc_pos; i++) { | |
331 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
332 | d = uvchr_to_utf8(d, seq_ptr[i].uv); | |
333 | } | |
334 | cc_pos = 0; | |
335 | } | |
336 | ||
337 | if (curCC == 0) { | |
338 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
339 | d = uvchr_to_utf8(d, uv); | |
340 | } | |
341 | } | |
342 | if (seq_ext) | |
343 | Safefree(seq_ext); | |
344 | *dp = dstart; | |
345 | return d; | |
346 | } | |
347 | ||
348 | static | |
349 | U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) | |
350 | { | |
351 | U8* p = s; | |
352 | U8* e = s + slen; | |
353 | U8* dstart = *dp; | |
354 | U8* d = dstart; | |
355 | ||
356 | UV uvS = 0; /* code point of the starter */ | |
357 | bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ | |
358 | U8 preCC = 0; | |
359 | ||
360 | UV seq_ary[CC_SEQ_SIZE]; | |
361 | UV* seq_ptr = seq_ary; /* use array at the beginning */ | |
362 | UV* seq_ext = NULL; /* extend if need */ | |
363 | STRLEN seq_max = CC_SEQ_SIZE; | |
364 | STRLEN cc_pos = 0; | |
365 | ||
366 | while (p < e) { | |
367 | U8 curCC; | |
368 | STRLEN retlen; | |
369 | UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); | |
370 | if (!retlen) | |
371 | croak(ErrRetlenIsZero, "compose"); | |
372 | p += retlen; | |
373 | ||
374 | curCC = getCombinClass(uv); | |
375 | ||
376 | if (!valid_uvS) { | |
377 | if (curCC == 0) { | |
378 | uvS = uv; /* the first Starter is found */ | |
379 | valid_uvS = TRUE; | |
380 | if (p < e) | |
381 | continue; | |
382 | } | |
383 | else { | |
384 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
385 | d = uvchr_to_utf8(d, uv); | |
386 | continue; | |
387 | } | |
388 | } | |
389 | else { | |
390 | bool composed; | |
391 | ||
392 | /* blocked */ | |
393 | if ((iscontig && cc_pos) || /* discontiguous combination */ | |
394 | (curCC != 0 && preCC == curCC) || /* blocked by same CC */ | |
395 | (preCC > curCC)) /* blocked by higher CC: revised D2 */ | |
396 | composed = FALSE; | |
397 | ||
398 | /* not blocked: | |
399 | iscontig && cc_pos == 0 -- contiguous combination | |
400 | curCC == 0 && preCC == 0 -- starter + starter | |
401 | curCC != 0 && preCC < curCC -- lower CC */ | |
402 | else { | |
403 | /* try composition */ | |
404 | UV uvComp = composite_uv(uvS, uv); | |
405 | ||
406 | if (uvComp && !isExclusion(uvComp)) { | |
407 | uvS = uvComp; | |
408 | composed = TRUE; | |
409 | ||
410 | /* preCC should not be changed to curCC */ | |
411 | /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ | |
412 | if (p < e) | |
413 | continue; | |
414 | } | |
415 | else | |
416 | composed = FALSE; | |
417 | } | |
418 | ||
419 | if (!composed) { | |
420 | preCC = curCC; | |
421 | if (curCC != 0 || !(p < e)) { | |
422 | if (seq_max < cc_pos + 1) { /* extend if need */ | |
423 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ | |
424 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ | |
425 | New(0, seq_ext, seq_max, UV); | |
426 | Copy(seq_ary, seq_ext, cc_pos, UV); | |
427 | } | |
428 | else { | |
429 | Renew(seq_ext, seq_max, UV); | |
430 | } | |
431 | seq_ptr = seq_ext; /* use seq_ext from now */ | |
432 | } | |
433 | seq_ptr[cc_pos] = uv; | |
434 | ++cc_pos; | |
435 | } | |
436 | if (curCC != 0 && p < e) | |
437 | continue; | |
438 | } | |
439 | } | |
440 | ||
441 | /* output */ | |
442 | { | |
443 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
444 | d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */ | |
445 | } | |
446 | ||
447 | if (cc_pos) { | |
448 | STRLEN i; | |
449 | ||
450 | for (i = 0; i < cc_pos; i++) { | |
451 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
452 | d = uvchr_to_utf8(d, seq_ptr[i]); | |
453 | } | |
454 | cc_pos = 0; | |
455 | } | |
456 | ||
457 | uvS = uv; | |
458 | } | |
459 | if (seq_ext) | |
460 | Safefree(seq_ext); | |
461 | *dp = dstart; | |
462 | return d; | |
463 | } | |
464 | ||
465 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize | |
466 | ||
# Decomposes src and returns the result as a new UTF-8 flagged string.
# A true value for compat selects compatibility decomposition; the default
# (false) selects canonical decomposition.  No reordering is done here.
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    /* The result SV is created only after the conversion, which may croak:
       an SV allocated beforehand would never be released in that case. */
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
488 | ||
489 | ||
# Returns a new UTF-8 flagged copy of src in which runs of characters with a
# non-zero combining class have been put into canonical order.
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    /* The result SV is created only after the conversion, which may croak:
       an SV allocated beforehand would never be released in that case. */
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
510 | ||
511 | ||
# Composes src and returns the result as a new UTF-8 flagged string.
# Called through the alias composeContiguous (ix == 1), only contiguous
# composition is performed.
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    /* The result SV is created only after the conversion, which may croak:
       an SV allocated beforehand would never be released in that case. */
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
534 | ||
535 | ||
# Returns src decomposed and canonically reordered, as a new UTF-8 flagged
# string.  NFD (ix == 0) uses canonical decomposition, the alias NFKD
# (ix == 1) uses compatibility decomposition.
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose s into t */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t;	/* bytes used; the allocated size is no longer known */

    /* step 2: reorder t into d */
    dlen = tlen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
    *dend = '\0';
    dlen = dend - d;	/* bytes used; the allocated size is no longer known */

    /* hand the result over to a fresh SV and drop the work buffers */
    RETVAL = newSVpvn((char *)d, dlen);
    SvUTF8_on(RETVAL);

    Safefree(t);
    Safefree(d);
  OUTPUT:
    RETVAL
573 | ||
574 | ||
# Returns src decomposed, canonically reordered and then composed, as a new
# UTF-8 flagged string.  The alias selects the variant:
#   NFC  (ix == 0): canonical decomposition, normal composition
#   NFKC (ix == 1): compatibility decomposition, normal composition
#   FCC  (ix == 2): canonical decomposition, contiguous composition
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose s into t */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t;	/* bytes used; the allocated size is no longer known */

    /* step 2: reorder t into u */
    ulen = tlen;
    New(0, u, ulen+1, U8);
    uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
    *uend = '\0';
    ulen = uend - u;	/* bytes used; the allocated size is no longer known */

    /* step 3: compose u into d */
    dlen = ulen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
    *dend = '\0';
    dlen = dend - d;	/* bytes used; the allocated size is no longer known */

    /* hand the result over to a fresh SV and drop the work buffers */
    RETVAL = newSVpvn((char *)d, dlen);
    SvUTF8_on(RETVAL);

    Safefree(t);
    Safefree(u);
    Safefree(d);
  OUTPUT:
    RETVAL
621 | ||
622 | ||
# Returns true when src needs no change to be in NFD (alias checkNFKD: NFKD),
# false otherwise.  The scan stops at the first character that is out of
# canonical order, is a Hangul syllable, or has a decomposition mapping.
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if ((preCC > curCC && curCC != 0) ||	/* canonical ordering violated */
            Hangul_IsS(uv) ||			/* decomposable syllable */
            (ix ? dec_compat(uv) : dec_canonical(uv))) {	/* has a mapping */
            result = FALSE;
            break;
        }
        preCC = curCC;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
657 | ||
658 | ||
# Quick check for NFC (alias checkNFKC: NFKC).  Returns false as soon as a
# character rules the form out, undef ("maybe") when no character ruled it
# out but at least one satisfies isComp2nd(), and true otherwise.
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property; Hangul syllables are canonical
           composites and therefore always YES */
        if (! Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when having a compatibility mapping that differs
                   from the canonical one */
                const char *canon  = (const char *) dec_canonical(uv);
                const char *compat = (const char *) dec_compat(uv);
                if (compat && (!canon || strNE(canon, compat))) {
                    result = FALSE;
                    break;
                }
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
713 | ||
714 | ||
# Quick check for FCD (alias checkFCC: FCC).  For every character the
# combining class of the first character of its canonical decomposition is
# compared with that of the last character of the previous decomposition.
# Returns false on an ordering violation.  For checkFCC the NFC properties
# are tested too, which may give false or undef ("maybe").
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
        U8 *decomp;
        UV uvLead;
        STRLEN decomplen = 0;
        UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        /* leading character: first of the decomposition, else uv itself */
        decomp = dec_canonical(uv);
        if (decomp) {
            STRLEN leadlen;
            decomplen = (STRLEN)strlen((char *) decomp);
            uvLead = utf8n_to_uvchr(decomp, decomplen, &leadlen, AllowAnyUTF);
            if (!leadlen)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }
        else {
            uvLead = uv;
        }

        curCC = getCombinClass(uvLead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {	/* checkFCC: also look at the NFC properties */
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing character: last of the decomposition, else uv itself */
        if (decomp) {
            STRLEN traillen;
            UV uvTrail;
            U8* decomp_end = decomp + decomplen;
            U8* last = utf8_hop(decomp_end, -1);
            if (last < decomp)
                croak(ErrHopBeforeStart);
            uvTrail = utf8n_to_uvchr(last, decomp_end - last, &traillen, AllowAnyUTF);
            if (!traillen)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(uvTrail);
        }
        else {
            preCC = curCC;
        }
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
788 | ||
789 | ||
# Perl-level access to the C function getCombinClass(): there is no CODE
# section, so the generated XSUB calls the C function of the same name.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
794 | ||
# Perl-level access to the C function isExclusion(): there is no CODE
# section, so the generated XSUB calls the C function of the same name.
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
799 | ||
# Perl-level access to the C function isSingleton(): there is no CODE
# section, so the generated XSUB calls the C function of the same name.
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
804 | ||
# Perl-level access to the C function isNonStDecomp(): there is no CODE
# section, so the generated XSUB calls the C function of the same name.
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
809 | ||
# Perl-level access to the C function isComp2nd(), also reachable under the
# names isNFC_MAYBE and isNFKC_MAYBE.  All three names behave the same, so
# the alias index ix is only marked as unused.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  INIT:
    PERL_UNUSED_VAR(ix);
c6b7cc21 SH |
819 | |
# Returns true when uv is a Hangul syllable or has a canonical decomposition
# mapping (alias isNFKD_NO: a compatibility mapping), false otherwise.
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result;
  CODE:
    /* NFD_NO or NFKD_NO */
    if (Hangul_IsS(uv))
        result = TRUE;
    else if (ix ? dec_compat(uv) : dec_canonical(uv))
        result = TRUE;
    else
        result = FALSE;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
834 | ||
835 | ||
# Returns true when uv is an exclusion, a singleton or a non-starter
# decomposition.  Under the alias isNFKC_NO (ix == 1) it is also true when uv
# has a compatibility mapping that differs from its canonical mapping.
# isNFC_NO (ix == 0) behaves exactly like isComp_Ex.
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool result = FALSE;
  CODE:
    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
        result = TRUE; /* NFC_NO or NFKC_NO */
    }
    else if (ix) {
        const char *compat = (const char *) dec_compat(uv);
        if (compat) {
            const char *canon = (const char *) dec_canonical(uv);
            if (!canon || strNE(canon, compat))
                result = TRUE; /* NFKC_NO */
        }
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
858 | ||
# Returns the code point composed from the pair (uv, uv2) as computed by
# composite_uv(), or undef when that function reports no composite (0).
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    if (composite)
        RETVAL = newSVuv(composite);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
871 | ||
872 | ||
873 | ||
# Returns the canonical decomposition of uv (alias getCompat: compatibility
# decomposition) as a UTF-8 flagged string, or undef when uv has none.
# Hangul syllables are decomposed arithmetically under either name.
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    if (Hangul_IsS(uv)) {
        /* room for up to three characters */
        U8 tmp[3 * UTF8_MAXLEN + 1];
        U8 *tend = pv_cat_decompHangul(aTHX_ tmp, uv);
        RETVAL = newSVpvn((char *)tmp, tend - tmp);
    } else {
        const char *mapped = (const char *)(ix ? dec_compat(uv) : dec_canonical(uv));
        if (!mapped)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn(mapped, strlen(mapped));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
895 | ||
896 | ||
# Splits src in front of its last character of combining class 0 and pushes
# two UTF-8 flagged mortal strings: everything before that character, and
# the rest from that character on.  Without such a character the first
# string is empty and the second holds all of src.
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    /* part before the split point */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* part from the split point to the end */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
925 |