Commit | Line | Data |
---|---|---|
ac5ea531 JH |
1 | |
2 | #include "EXTERN.h" | |
3 | #include "perl.h" | |
4 | #include "XSUB.h" | |
5 | ||
6 | /* These 5 files are prepared by mkheader */ | |
7 | #include "unfcmb.h" | |
8 | #include "unfcan.h" | |
9 | #include "unfcpt.h" | |
10 | #include "unfcmp.h" | |
11 | #include "unfexc.h" | |
12 | ||
13 | /* Perl 5.6.1 ? */ | |
14 | #ifndef uvuni_to_utf8 | |
15 | #define uvuni_to_utf8 uv_to_utf8 | |
6c941e0c | 16 | #endif /* uvuni_to_utf8 */ |
ac5ea531 JH |
17 | |
18 | /* Perl 5.6.1 ? */ | |
ab8fe378 JH |
19 | #ifndef utf8n_to_uvuni |
20 | #define utf8n_to_uvuni utf8_to_uv | |
6c941e0c | 21 | #endif /* utf8n_to_uvuni */ |
ac5ea531 | 22 | |
e524f5b2 NC |
23 | /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
24 | #ifdef UTF8_ALLOW_BOM | |
25 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF) | |
628bbff0 | 26 | #else |
e524f5b2 NC |
27 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF) |
28 | #endif | |
29 | ||
30 | /* if utf8n_to_uvuni() sets retlen to 0 (?) */ | |
fe067ad9 | 31 | #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
82e740b6 NC |
32 | |
33 | /* utf8_hop() hops back before start. Maybe broken UTF-8 */ | |
34 | #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" | |
35 | ||
51683ce6 TS |
36 | /* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC. |
37 | If Unicode would add a new composition of A + B to C | |
38 | where bytes::length(A) + bytes::length(B) < bytes::length(C), | |
39 | this code should be fixed. | |
40 | In this case, mkheader will prevent Unicode::Normalize from building. */ | |
fe067ad9 SP |
41 | #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" |
42 | ||
43 | /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ | |
44 | #define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough" | |
45 | ||
ac5ea531 JH |
46 | /* At present, char > 0x10ffff are unaffected without complaint, right? */ |
47 | #define VALID_UTF_MAX (0x10ffff) | |
48 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) | |
49 | ||
fe067ad9 SP |
50 | /* size of array for combining characters */ |
51 | /* enough as an initial value? */ | |
52 | #define CC_SEQ_SIZE (10) | |
53 | #define CC_SEQ_STEP (5) | |
54 | ||
55 | /* HANGUL begin */ | |
ac5ea531 JH |
56 | #define Hangul_SBase 0xAC00 |
57 | #define Hangul_SFinal 0xD7A3 | |
58 | #define Hangul_SCount 11172 | |
59 | ||
60 | #define Hangul_NCount 588 | |
61 | ||
62 | #define Hangul_LBase 0x1100 | |
63 | #define Hangul_LFinal 0x1112 | |
64 | #define Hangul_LCount 19 | |
65 | ||
66 | #define Hangul_VBase 0x1161 | |
67 | #define Hangul_VFinal 0x1175 | |
68 | #define Hangul_VCount 21 | |
69 | ||
70 | #define Hangul_TBase 0x11A7 | |
71 | #define Hangul_TFinal 0x11C2 | |
72 | #define Hangul_TCount 28 | |
73 | ||
74 | #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) | |
2a204b45 | 75 | #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) |
ac5ea531 JH |
76 | #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) |
77 | #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) | |
78 | #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) | |
79 | #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) | |
fe067ad9 | 80 | /* HANGUL end */ |
ac5ea531 JH |
81 | |
82 | /* this is used for canonical ordering of combining characters (c.c.). */ | |
83 | typedef struct { | |
84 | U8 cc; /* combining class */ | |
85 | UV uv; /* codepoint */ | |
86 | STRLEN pos; /* position */ | |
87 | } UNF_cc; | |
88 | ||
fe067ad9 | 89 | static int compare_cc(const void *a, const void *b) |
ac5ea531 JH |
90 | { |
91 | int ret_cc; | |
6c941e0c | 92 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd JH |
93 | if (ret_cc) |
94 | return ret_cc; | |
6c941e0c JH |
95 | |
96 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) | |
97 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); | |
ac5ea531 JH |
98 | } |
99 | ||
fe067ad9 | 100 | static U8* dec_canonical(UV uv) |
ac5ea531 JH |
101 | { |
102 | U8 ***plane, **row; | |
8f118dcd JH |
103 | if (OVER_UTF_MAX(uv)) |
104 | return NULL; | |
ac5ea531 | 105 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd JH |
106 | if (! plane) |
107 | return NULL; | |
ac5ea531 JH |
108 | row = plane[(uv >> 8) & 0xff]; |
109 | return row ? row[uv & 0xff] : NULL; | |
110 | } | |
111 | ||
fe067ad9 | 112 | static U8* dec_compat(UV uv) |
ac5ea531 JH |
113 | { |
114 | U8 ***plane, **row; | |
8f118dcd JH |
115 | if (OVER_UTF_MAX(uv)) |
116 | return NULL; | |
ac5ea531 | 117 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd JH |
118 | if (! plane) |
119 | return NULL; | |
ac5ea531 JH |
120 | row = plane[(uv >> 8) & 0xff]; |
121 | return row ? row[uv & 0xff] : NULL; | |
122 | } | |
123 | ||
fe067ad9 | 124 | static UV composite_uv(UV uv, UV uv2) |
ac5ea531 JH |
125 | { |
126 | UNF_complist ***plane, **row, *cell, *i; | |
127 | ||
fe067ad9 | 128 | if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
8f118dcd | 129 | return 0; |
ac5ea531 | 130 | |
8f118dcd | 131 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
fe067ad9 SP |
132 | UV lindex = uv - Hangul_LBase; |
133 | UV vindex = uv2 - Hangul_VBase; | |
134 | return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * | |
135 | Hangul_TCount); | |
ac5ea531 | 136 | } |
8f118dcd | 137 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
fe067ad9 SP |
138 | UV tindex = uv2 - Hangul_TBase; |
139 | return(uv + tindex); | |
ac5ea531 JH |
140 | } |
141 | plane = UNF_compos[uv >> 16]; | |
8f118dcd JH |
142 | if (! plane) |
143 | return 0; | |
ac5ea531 | 144 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd JH |
145 | if (! row) |
146 | return 0; | |
ac5ea531 | 147 | cell = row[uv & 0xff]; |
8f118dcd JH |
148 | if (! cell) |
149 | return 0; | |
150 | for (i = cell; i->nextchar; i++) { | |
151 | if (uv2 == i->nextchar) | |
152 | return i->composite; | |
ac5ea531 JH |
153 | } |
154 | return 0; | |
155 | } | |
156 | ||
fe067ad9 | 157 | static U8 getCombinClass(UV uv) |
ac5ea531 JH |
158 | { |
159 | U8 **plane, *row; | |
8f118dcd JH |
160 | if (OVER_UTF_MAX(uv)) |
161 | return 0; | |
ac5ea531 | 162 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd JH |
163 | if (! plane) |
164 | return 0; | |
ac5ea531 JH |
165 | row = plane[(uv >> 8) & 0xff]; |
166 | return row ? row[uv & 0xff] : 0; | |
167 | } | |
168 | ||
fe067ad9 | 169 | static U8* pv_cat_decompHangul(U8* d, UV uv) |
ac5ea531 | 170 | { |
fe067ad9 SP |
171 | UV sindex = uv - Hangul_SBase; |
172 | UV lindex = sindex / Hangul_NCount; | |
173 | UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; | |
174 | UV tindex = sindex % Hangul_TCount; | |
ac5ea531 | 175 | |
8f118dcd | 176 | if (! Hangul_IsS(uv)) |
fe067ad9 | 177 | return d; |
ac5ea531 | 178 | |
fe067ad9 SP |
179 | d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); |
180 | d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); | |
8f118dcd | 181 | if (tindex) |
fe067ad9 SP |
182 | d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); |
183 | return d; | |
ac5ea531 JH |
184 | } |
185 | ||
39f4556f | 186 | static char* sv_2pvunicode(SV *sv, STRLEN *lp) |
a092bcfd RGS |
187 | { |
188 | char *s; | |
189 | STRLEN len; | |
39f4556f | 190 | s = SvPV(sv,len); |
a092bcfd | 191 | if (!SvUTF8(sv)) { |
39f4556f | 192 | SV* tmpsv = sv_2mortal(newSVpvn(s, len)); |
a092bcfd | 193 | if (!SvPOK(tmpsv)) |
39f4556f | 194 | s = SvPV_force(tmpsv,len); |
a092bcfd | 195 | sv_utf8_upgrade(tmpsv); |
39f4556f | 196 | s = SvPV(tmpsv,len); |
a092bcfd | 197 | } |
fe067ad9 SP |
198 | if (lp) |
199 | *lp = len; | |
a092bcfd RGS |
200 | return s; |
201 | } | |
202 | ||
fe067ad9 SP |
203 | static |
204 | U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) | |
205 | { | |
206 | U8* p = s; | |
207 | U8* e = s + slen; | |
208 | U8* dstart = *dp; | |
209 | U8* d = dstart; | |
210 | ||
211 | while (p < e) { | |
212 | STRLEN retlen; | |
213 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); | |
82e740b6 | 214 | if (!retlen) |
fe067ad9 SP |
215 | croak(ErrRetlenIsZero, "decompose"); |
216 | p += retlen; | |
217 | ||
218 | if (Hangul_IsS(uv)) { | |
219 | STRLEN cur = d - dstart; | |
82e740b6 | 220 | |
fe067ad9 SP |
221 | if (dlen < cur + UTF8_MAXLEN * 3) { |
222 | dlen += UTF8_MAXLEN * 3; | |
223 | Renew(dstart, dlen+1, U8); | |
224 | d = dstart + cur; | |
225 | } | |
226 | d = pv_cat_decompHangul(d, uv); | |
227 | } | |
ac5ea531 | 228 | else { |
fe067ad9 SP |
229 | U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
230 | ||
231 | if (r) { | |
232 | STRLEN len = (STRLEN)strlen((char *)r); | |
233 | STRLEN cur = d - dstart; | |
234 | if (dlen < cur + len) { | |
235 | dlen += len; | |
236 | Renew(dstart, dlen+1, U8); | |
237 | d = dstart + cur; | |
238 | } | |
239 | while (len--) | |
240 | *d++ = *r++; | |
241 | } | |
242 | else { | |
243 | STRLEN cur = d - dstart; | |
244 | ||
245 | if (dlen < cur + UTF8_MAXLEN) { | |
246 | dlen += UTF8_MAXLEN; | |
247 | Renew(dstart, dlen+1, U8); | |
248 | d = dstart + cur; | |
249 | } | |
250 | d = uvuni_to_utf8(d, uv); | |
251 | } | |
ac5ea531 JH |
252 | } |
253 | } | |
fe067ad9 SP |
254 | *dp = dstart; |
255 | return d; | |
256 | } | |
ac5ea531 | 257 | |
fe067ad9 SP |
258 | static |
259 | U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen) | |
260 | { | |
261 | U8* p = s; | |
262 | U8* e = s + slen; | |
263 | U8* dend = d + dlen; | |
264 | ||
265 | UNF_cc seq_ary[CC_SEQ_SIZE]; | |
266 | UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ | |
267 | UNF_cc* seq_ext = NULL; /* extend if need */ | |
268 | STRLEN seq_max = CC_SEQ_SIZE; | |
269 | STRLEN cc_pos = 0; | |
270 | ||
271 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) | |
272 | croak(ErrTargetNotEnough, "reorder"); | |
273 | dend -= UTF8_MAXLEN; /* safety */ | |
274 | ||
275 | while (p < e) { | |
276 | U8 curCC; | |
277 | STRLEN retlen; | |
278 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); | |
279 | if (!retlen) | |
280 | croak(ErrRetlenIsZero, "reorder"); | |
281 | p += retlen; | |
ac5ea531 | 282 | |
fe067ad9 | 283 | curCC = getCombinClass(uv); |
ac5ea531 | 284 | |
fe067ad9 SP |
285 | if (curCC != 0) { |
286 | if (seq_max < cc_pos + 1) { /* extend if need */ | |
287 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ | |
288 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ | |
289 | STRLEN i; | |
290 | New(0, seq_ext, seq_max, UNF_cc); | |
291 | for (i = 0; i < cc_pos; i++) | |
292 | seq_ext[i] = seq_ary[i]; | |
293 | } | |
294 | else { | |
295 | Renew(seq_ext, seq_max, UNF_cc); | |
296 | } | |
39f4556f | 297 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 | 298 | } |
a092bcfd | 299 | |
fe067ad9 SP |
300 | seq_ptr[cc_pos].cc = curCC; |
301 | seq_ptr[cc_pos].uv = uv; | |
302 | seq_ptr[cc_pos].pos = cc_pos; | |
303 | ++cc_pos; | |
ac5ea531 | 304 | |
fe067ad9 SP |
305 | if (p < e) |
306 | continue; | |
307 | } | |
ac5ea531 | 308 | |
fe067ad9 SP |
309 | if (cc_pos) { |
310 | STRLEN i; | |
311 | ||
312 | if (cc_pos > 1) /* reordered if there are two c.c.'s */ | |
313 | qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); | |
314 | ||
315 | for (i = 0; i < cc_pos; i++) { | |
316 | d = uvuni_to_utf8(d, seq_ptr[i].uv); | |
317 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ | |
318 | croak(ErrLongerThanSrc, "reorder"); | |
319 | } | |
320 | cc_pos = 0; | |
321 | } | |
2a204b45 | 322 | |
e524f5b2 NC |
323 | if (curCC == 0) { |
324 | d = uvuni_to_utf8(d, uv); | |
fe067ad9 SP |
325 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ |
326 | croak(ErrLongerThanSrc, "reorder"); | |
e524f5b2 | 327 | } |
fe067ad9 SP |
328 | } |
329 | if (seq_ext) | |
330 | Safefree(seq_ext); | |
331 | return d; | |
332 | } | |
ac5ea531 | 333 | |
fe067ad9 SP |
334 | static |
335 | U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig) | |
336 | { | |
337 | U8* p = s; | |
338 | U8* e = s + slen; | |
339 | U8* dend = d + dlen; | |
340 | ||
39f4556f | 341 | UV uvS = 0; /* code point of the starter */ |
fe067ad9 SP |
342 | bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
343 | U8 preCC = 0; | |
344 | ||
345 | UV seq_ary[CC_SEQ_SIZE]; | |
346 | UV* seq_ptr = seq_ary; /* use array at the beginning */ | |
347 | UV* seq_ext = NULL; /* extend if need */ | |
348 | STRLEN seq_max = CC_SEQ_SIZE; | |
349 | STRLEN cc_pos = 0; | |
350 | ||
351 | if (dlen < slen || dlen < slen + UTF8_MAXLEN) | |
352 | croak(ErrTargetNotEnough, "compose"); | |
353 | dend -= UTF8_MAXLEN; /* safety */ | |
354 | ||
355 | while (p < e) { | |
356 | U8 curCC; | |
357 | STRLEN retlen; | |
358 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); | |
359 | if (!retlen) | |
360 | croak(ErrRetlenIsZero, "compose"); | |
361 | p += retlen; | |
ac5ea531 | 362 | |
fe067ad9 | 363 | curCC = getCombinClass(uv); |
82e740b6 | 364 | |
fe067ad9 | 365 | if (!valid_uvS) { |
e524f5b2 | 366 | if (curCC == 0) { |
fe067ad9 SP |
367 | uvS = uv; /* the first Starter is found */ |
368 | valid_uvS = TRUE; | |
369 | if (p < e) | |
370 | continue; | |
e524f5b2 | 371 | } |
fe067ad9 SP |
372 | else { |
373 | d = uvuni_to_utf8(d, uv); | |
374 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ | |
375 | croak(ErrLongerThanSrc, "compose"); | |
376 | continue; | |
377 | } | |
378 | } | |
379 | else { | |
380 | bool composed; | |
381 | ||
382 | /* blocked */ | |
383 | if (iscontig && cc_pos || /* discontiguous combination */ | |
384 | curCC != 0 && preCC == curCC || /* blocked by same CC */ | |
385 | preCC > curCC) /* blocked by higher CC: revised D2 */ | |
386 | composed = FALSE; | |
387 | ||
388 | /* not blocked: | |
389 | iscontig && cc_pos == 0 -- contiguous combination | |
390 | curCC == 0 && preCC == 0 -- starter + starter | |
391 | curCC != 0 && preCC < curCC -- lower CC */ | |
392 | else { | |
393 | /* try composition */ | |
394 | UV uvComp = composite_uv(uvS, uv); | |
395 | ||
396 | if (uvComp && !isExclusion(uvComp)) { | |
397 | uvS = uvComp; | |
398 | composed = TRUE; | |
82e740b6 | 399 | |
fe067ad9 SP |
400 | /* preCC should not be changed to curCC */ |
401 | /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ | |
402 | if (p < e) | |
403 | continue; | |
404 | } | |
405 | else | |
406 | composed = FALSE; | |
407 | } | |
408 | ||
409 | if (!composed) { | |
410 | preCC = curCC; | |
411 | if (curCC != 0 || !(p < e)) { | |
412 | if (seq_max < cc_pos + 1) { /* extend if need */ | |
413 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ | |
414 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ | |
415 | New(0, seq_ext, seq_max, UV); | |
416 | Copy(seq_ary, seq_ext, cc_pos, UV); | |
417 | } | |
418 | else { | |
419 | Renew(seq_ext, seq_max, UV); | |
420 | } | |
2b8d773d | 421 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 SP |
422 | } |
423 | seq_ptr[cc_pos] = uv; | |
424 | ++cc_pos; | |
425 | } | |
426 | if (curCC != 0 && p < e) | |
427 | continue; | |
ac5ea531 | 428 | } |
ac5ea531 JH |
429 | } |
430 | ||
fe067ad9 SP |
431 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
432 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ | |
433 | croak(ErrLongerThanSrc, "compose"); | |
434 | ||
e524f5b2 | 435 | if (cc_pos) { |
fe067ad9 | 436 | STRLEN i; |
ac5ea531 | 437 | |
fe067ad9 SP |
438 | for (i = 0; i < cc_pos; i++) { |
439 | d = uvuni_to_utf8(d, seq_ptr[i]); | |
440 | if (dend < d) /* real end is dend + UTF8_MAXLEN */ | |
441 | croak(ErrLongerThanSrc, "compose"); | |
442 | } | |
443 | cc_pos = 0; | |
ac5ea531 | 444 | } |
fe067ad9 SP |
445 | |
446 | uvS = uv; | |
ac5ea531 | 447 | } |
fe067ad9 SP |
448 | if (seq_ext) |
449 | Safefree(seq_ext); | |
450 | return d; | |
451 | } | |
452 | ||
453 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize | |
454 | ||
455 | SV* | |
456 | decompose(src, compat = &PL_sv_no) | |
457 | SV * src | |
458 | SV * compat | |
459 | PROTOTYPE: $;$ | |
460 | PREINIT: | |
461 | SV* dst; | |
462 | U8 *s, *d, *dend; | |
463 | STRLEN slen, dlen; | |
464 | CODE: | |
465 | s = (U8*)sv_2pvunicode(src,&slen); | |
466 | dst = newSVpvn("", 0); | |
467 | dlen = slen; | |
468 | New(0, d, dlen+1, U8); | |
39f4556f | 469 | dend = pv_utf8_decompose(s, slen, &d, dlen, (bool)SvTRUE(compat)); |
9b374906 | 470 | sv_setpvn(dst, (char *)d, dend - d); |
fe067ad9 SP |
471 | SvUTF8_on(dst); |
472 | Safefree(d); | |
8f118dcd | 473 | RETVAL = dst; |
ac5ea531 JH |
474 | OUTPUT: |
475 | RETVAL | |
476 | ||
fe067ad9 SP |
477 | SV* |
478 | reorder(src) | |
479 | SV * src | |
480 | PROTOTYPE: $ | |
481 | PREINIT: | |
482 | SV* dst; | |
483 | U8 *s, *d, *dend; | |
484 | STRLEN slen, dlen; | |
485 | CODE: | |
486 | s = (U8*)sv_2pvunicode(src,&slen); | |
487 | dst = newSVpvn("", 0); | |
488 | dlen = slen + UTF8_MAXLEN; | |
489 | d = (U8*)SvGROW(dst,dlen+1); | |
490 | SvUTF8_on(dst); | |
491 | dend = pv_utf8_reorder(s, slen, d, dlen); | |
492 | *dend = '\0'; | |
493 | SvCUR_set(dst, dend - d); | |
494 | RETVAL = dst; | |
495 | OUTPUT: | |
496 | RETVAL | |
ac5ea531 | 497 | |
2a204b45 | 498 | SV* |
a092bcfd RGS |
499 | compose(src) |
500 | SV * src | |
ac5ea531 | 501 | PROTOTYPE: $ |
82e740b6 NC |
502 | ALIAS: |
503 | composeContiguous = 1 | |
ac5ea531 | 504 | PREINIT: |
fe067ad9 SP |
505 | SV* dst; |
506 | U8 *s, *d, *dend; | |
507 | STRLEN slen, dlen; | |
2a204b45 | 508 | CODE: |
fe067ad9 SP |
509 | s = (U8*)sv_2pvunicode(src,&slen); |
510 | dst = newSVpvn("", 0); | |
511 | dlen = slen + UTF8_MAXLEN; | |
512 | d = (U8*)SvGROW(dst,dlen+1); | |
ac5ea531 | 513 | SvUTF8_on(dst); |
fe067ad9 SP |
514 | dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix); |
515 | *dend = '\0'; | |
516 | SvCUR_set(dst, dend - d); | |
517 | RETVAL = dst; | |
518 | OUTPUT: | |
519 | RETVAL | |
ac5ea531 | 520 | |
fe067ad9 SP |
521 | SV* |
522 | NFD(src) | |
523 | SV * src | |
524 | PROTOTYPE: $ | |
525 | ALIAS: | |
526 | NFKD = 1 | |
527 | PREINIT: | |
528 | SV *dst; | |
529 | U8 *s, *t, *tend, *d, *dend; | |
530 | STRLEN slen, tlen, dlen; | |
531 | CODE: | |
532 | /* decompose */ | |
533 | s = (U8*)sv_2pvunicode(src,&slen); | |
534 | tlen = slen; | |
535 | New(0, t, tlen+1, U8); | |
536 | tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix); | |
537 | *tend = '\0'; | |
538 | tlen = tend - t; /* no longer know real tlen */ | |
539 | ||
540 | /* reorder */ | |
541 | dst = newSVpvn("", 0); | |
542 | dlen = tlen + UTF8_MAXLEN; | |
543 | d = (U8*)SvGROW(dst,dlen+1); | |
544 | SvUTF8_on(dst); | |
545 | dend = pv_utf8_reorder(t, tlen, d, dlen); | |
546 | *dend = '\0'; | |
547 | SvCUR_set(dst, dend - d); | |
ac5ea531 | 548 | |
fe067ad9 SP |
549 | /* return */ |
550 | Safefree(t); | |
551 | RETVAL = dst; | |
552 | OUTPUT: | |
553 | RETVAL | |
82e740b6 | 554 | |
fe067ad9 SP |
555 | SV* |
556 | NFC(src) | |
557 | SV * src | |
558 | PROTOTYPE: $ | |
559 | ALIAS: | |
560 | NFKC = 1 | |
561 | FCC = 2 | |
562 | PREINIT: | |
563 | SV *dst; | |
564 | U8 *s, *t, *tend, *u, *uend, *d, *dend; | |
565 | STRLEN slen, tlen, ulen, dlen; | |
566 | CODE: | |
567 | /* decompose */ | |
568 | s = (U8*)sv_2pvunicode(src,&slen); | |
569 | tlen = slen; | |
570 | New(0, t, tlen+1, U8); | |
571 | tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1)); | |
572 | *tend = '\0'; | |
573 | tlen = tend - t; /* no longer know real tlen */ | |
574 | ||
575 | /* reorder */ | |
576 | ulen = tlen + UTF8_MAXLEN; | |
577 | New(0, u, ulen+1, U8); | |
578 | uend = pv_utf8_reorder(t, tlen, u, ulen); | |
579 | *uend = '\0'; | |
580 | ulen = uend - u; | |
581 | ||
582 | /* compose */ | |
583 | dst = newSVpvn("", 0); | |
584 | dlen = ulen + UTF8_MAXLEN; | |
585 | d = (U8*)SvGROW(dst,dlen+1); | |
586 | SvUTF8_on(dst); | |
587 | dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2)); | |
588 | *dend = '\0'; | |
589 | SvCUR_set(dst, dend - d); | |
ac5ea531 | 590 | |
fe067ad9 SP |
591 | /* return */ |
592 | Safefree(t); | |
593 | Safefree(u); | |
2a204b45 JH |
594 | RETVAL = dst; |
595 | OUTPUT: | |
596 | RETVAL | |
ac5ea531 | 597 | |
2b8d773d | 598 | SV* |
a092bcfd RGS |
599 | checkNFD(src) |
600 | SV * src | |
8f118dcd JH |
601 | PROTOTYPE: $ |
602 | ALIAS: | |
603 | checkNFKD = 1 | |
604 | PREINIT: | |
8f118dcd JH |
605 | STRLEN srclen, retlen; |
606 | U8 *s, *e, *p, curCC, preCC; | |
2b8d773d | 607 | bool result = TRUE; |
82e740b6 | 608 | CODE: |
a092bcfd | 609 | s = (U8*)sv_2pvunicode(src,&srclen); |
8f118dcd JH |
610 | e = s + srclen; |
611 | ||
612 | preCC = 0; | |
613 | for (p = s; p < e; p += retlen) { | |
fe067ad9 | 614 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 | 615 | if (!retlen) |
fe067ad9 | 616 | croak(ErrRetlenIsZero, "checkNFD or -NFKD"); |
82e740b6 | 617 | |
8f118dcd | 618 | curCC = getCombinClass(uv); |
2b8d773d RGS |
619 | if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
620 | result = FALSE; | |
621 | break; | |
622 | } | |
623 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { | |
624 | result = FALSE; | |
625 | break; | |
626 | } | |
8f118dcd JH |
627 | preCC = curCC; |
628 | } | |
2b8d773d RGS |
629 | RETVAL = boolSV(result); |
630 | OUTPUT: | |
631 | RETVAL | |
8f118dcd JH |
632 | |
633 | ||
2b8d773d | 634 | SV* |
a092bcfd RGS |
635 | checkNFC(src) |
636 | SV * src | |
8f118dcd JH |
637 | PROTOTYPE: $ |
638 | ALIAS: | |
639 | checkNFKC = 1 | |
640 | PREINIT: | |
8f118dcd JH |
641 | STRLEN srclen, retlen; |
642 | U8 *s, *e, *p, curCC, preCC; | |
2b8d773d RGS |
643 | bool result = TRUE; |
644 | bool isMAYBE = FALSE; | |
82e740b6 | 645 | CODE: |
a092bcfd | 646 | s = (U8*)sv_2pvunicode(src,&srclen); |
8f118dcd JH |
647 | e = s + srclen; |
648 | ||
649 | preCC = 0; | |
8f118dcd | 650 | for (p = s; p < e; p += retlen) { |
fe067ad9 | 651 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 | 652 | if (!retlen) |
fe067ad9 | 653 | croak(ErrRetlenIsZero, "checkNFC or -NFKC"); |
82e740b6 | 654 | |
8f118dcd | 655 | curCC = getCombinClass(uv); |
2b8d773d RGS |
656 | if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
657 | result = FALSE; | |
658 | break; | |
659 | } | |
8f118dcd JH |
660 | |
661 | /* get NFC/NFKC property */ | |
662 | if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ | |
663 | ; /* YES */ | |
2b8d773d RGS |
664 | else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
665 | result = FALSE; | |
666 | break; | |
667 | } | |
8f118dcd JH |
668 | else if (isComp2nd(uv)) |
669 | isMAYBE = TRUE; | |
670 | else if (ix) { | |
671 | char *canon, *compat; | |
6c941e0c | 672 | /* NFKC_NO when having compatibility mapping. */ |
8f118dcd JH |
673 | canon = (char *) dec_canonical(uv); |
674 | compat = (char *) dec_compat(uv); | |
2b8d773d RGS |
675 | if (compat && !(canon && strEQ(canon, compat))) { |
676 | result = FALSE; | |
677 | break; | |
678 | } | |
8f118dcd JH |
679 | } /* end of get NFC/NFKC property */ |
680 | ||
681 | preCC = curCC; | |
682 | } | |
2b8d773d | 683 | if (isMAYBE && result) /* NO precedes MAYBE */ |
8f118dcd | 684 | XSRETURN_UNDEF; |
2b8d773d RGS |
685 | RETVAL = boolSV(result); |
686 | OUTPUT: | |
687 | RETVAL | |
8f118dcd JH |
688 | |
689 | ||
2b8d773d | 690 | SV* |
a092bcfd RGS |
691 | checkFCD(src) |
692 | SV * src | |
82e740b6 NC |
693 | PROTOTYPE: $ |
694 | ALIAS: | |
695 | checkFCC = 1 | |
696 | PREINIT: | |
fe067ad9 | 697 | STRLEN srclen, retlen; |
82e740b6 | 698 | U8 *s, *e, *p, curCC, preCC; |
2b8d773d RGS |
699 | bool result = TRUE; |
700 | bool isMAYBE = FALSE; | |
82e740b6 | 701 | CODE: |
a092bcfd | 702 | s = (U8*)sv_2pvunicode(src,&srclen); |
82e740b6 | 703 | e = s + srclen; |
82e740b6 | 704 | preCC = 0; |
82e740b6 | 705 | for (p = s; p < e; p += retlen) { |
fe067ad9 SP |
706 | U8 *sCan; |
707 | UV uvLead; | |
39f4556f | 708 | STRLEN canlen = 0; |
fe067ad9 | 709 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 | 710 | if (!retlen) |
fe067ad9 | 711 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
82e740b6 NC |
712 | |
713 | sCan = (U8*) dec_canonical(uv); | |
714 | ||
715 | if (sCan) { | |
39f4556f | 716 | STRLEN canret; |
82e740b6 | 717 | canlen = (STRLEN)strlen((char *) sCan); |
e524f5b2 | 718 | uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); |
fe067ad9 SP |
719 | if (!canret) |
720 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); | |
82e740b6 NC |
721 | } |
722 | else { | |
723 | uvLead = uv; | |
724 | } | |
725 | ||
726 | curCC = getCombinClass(uvLead); | |
727 | ||
2b8d773d RGS |
728 | if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ |
729 | result = FALSE; | |
730 | break; | |
731 | } | |
82e740b6 NC |
732 | |
733 | if (ix) { | |
2b8d773d RGS |
734 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
735 | result = FALSE; | |
736 | break; | |
737 | } | |
82e740b6 NC |
738 | else if (isComp2nd(uv)) |
739 | isMAYBE = TRUE; | |
740 | } | |
741 | ||
742 | if (sCan) { | |
39f4556f | 743 | STRLEN canret; |
fe067ad9 SP |
744 | UV uvTrail; |
745 | U8* eCan = sCan + canlen; | |
746 | U8* pCan = utf8_hop(eCan, -1); | |
82e740b6 NC |
747 | if (pCan < sCan) |
748 | croak(ErrHopBeforeStart); | |
e524f5b2 | 749 | uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); |
fe067ad9 SP |
750 | if (!canret) |
751 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); | |
82e740b6 NC |
752 | preCC = getCombinClass(uvTrail); |
753 | } | |
754 | else { | |
755 | preCC = curCC; | |
756 | } | |
757 | } | |
2b8d773d | 758 | if (isMAYBE && result) /* NO precedes MAYBE */ |
82e740b6 | 759 | XSRETURN_UNDEF; |
2b8d773d RGS |
760 | RETVAL = boolSV(result); |
761 | OUTPUT: | |
762 | RETVAL | |
82e740b6 NC |
763 | |
764 | ||
ac5ea531 JH |
765 | U8 |
766 | getCombinClass(uv) | |
767 | UV uv | |
8f118dcd | 768 | PROTOTYPE: $ |
ac5ea531 JH |
769 | |
770 | bool | |
2a204b45 | 771 | isExclusion(uv) |
ac5ea531 | 772 | UV uv |
8f118dcd JH |
773 | PROTOTYPE: $ |
774 | ||
775 | bool | |
776 | isSingleton(uv) | |
777 | UV uv | |
778 | PROTOTYPE: $ | |
779 | ||
780 | bool | |
781 | isNonStDecomp(uv) | |
782 | UV uv | |
783 | PROTOTYPE: $ | |
784 | ||
785 | bool | |
786 | isComp2nd(uv) | |
787 | UV uv | |
788 | PROTOTYPE: $ | |
789 | ALIAS: | |
790 | isNFC_MAYBE = 1 | |
791 | isNFKC_MAYBE = 2 | |
792 | ||
793 | ||
794 | ||
2b8d773d | 795 | SV* |
8f118dcd JH |
796 | isNFD_NO(uv) |
797 | UV uv | |
798 | PROTOTYPE: $ | |
799 | ALIAS: | |
800 | isNFKD_NO = 1 | |
2b8d773d RGS |
801 | PREINIT: |
802 | bool result = FALSE; | |
82e740b6 | 803 | CODE: |
8f118dcd | 804 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) |
2b8d773d RGS |
805 | result = TRUE; /* NFD_NO or NFKD_NO */ |
806 | RETVAL = boolSV(result); | |
807 | OUTPUT: | |
808 | RETVAL | |
8f118dcd JH |
809 | |
810 | ||
2b8d773d | 811 | SV* |
8f118dcd JH |
812 | isComp_Ex(uv) |
813 | UV uv | |
814 | PROTOTYPE: $ | |
815 | ALIAS: | |
816 | isNFC_NO = 0 | |
817 | isNFKC_NO = 1 | |
2b8d773d RGS |
818 | PREINIT: |
819 | bool result = FALSE; | |
82e740b6 | 820 | CODE: |
8f118dcd | 821 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
2b8d773d | 822 | result = TRUE; /* NFC_NO or NFKC_NO */ |
8f118dcd JH |
823 | else if (ix) { |
824 | char *canon, *compat; | |
825 | canon = (char *) dec_canonical(uv); | |
826 | compat = (char *) dec_compat(uv); | |
827 | if (compat && (!canon || strNE(canon, compat))) | |
2b8d773d | 828 | result = TRUE; /* NFC_NO or NFKC_NO */ |
8f118dcd | 829 | } |
2b8d773d RGS |
830 | RETVAL = boolSV(result); |
831 | OUTPUT: | |
832 | RETVAL | |
ac5ea531 | 833 | |
2a204b45 | 834 | SV* |
ac5ea531 JH |
835 | getComposite(uv, uv2) |
836 | UV uv | |
837 | UV uv2 | |
2a204b45 JH |
838 | PROTOTYPE: $$ |
839 | PREINIT: | |
bcdb689b | 840 | UV composite; |
2a204b45 | 841 | CODE: |
bcdb689b JH |
842 | composite = composite_uv(uv, uv2); |
843 | RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; | |
2a204b45 JH |
844 | OUTPUT: |
845 | RETVAL | |
ac5ea531 | 846 | |
8f118dcd JH |
847 | |
848 | ||
ac5ea531 JH |
849 | SV* |
850 | getCanon(uv) | |
851 | UV uv | |
852 | PROTOTYPE: $ | |
853 | ALIAS: | |
854 | getCompat = 1 | |
ac5ea531 | 855 | CODE: |
8f118dcd | 856 | if (Hangul_IsS(uv)) { |
fe067ad9 SP |
857 | U8 tmp[3 * UTF8_MAXLEN + 1]; |
858 | U8 *t = tmp; | |
859 | U8 *e = pv_cat_decompHangul(t, uv); | |
860 | RETVAL = newSVpvn((char *)t, e - t); | |
ac5ea531 | 861 | } else { |
fe067ad9 | 862 | U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); |
8f118dcd JH |
863 | if (!rstr) |
864 | XSRETURN_UNDEF; | |
ac5ea531 JH |
865 | RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); |
866 | } | |
867 | SvUTF8_on(RETVAL); | |
868 | OUTPUT: | |
869 | RETVAL | |
870 | ||
82e740b6 NC |
871 | |
872 | void | |
a092bcfd RGS |
873 | splitOnLastStarter(src) |
874 | SV * src | |
82e740b6 | 875 | PREINIT: |
a092bcfd | 876 | SV *svp; |
fe067ad9 | 877 | STRLEN srclen; |
82e740b6 NC |
878 | U8 *s, *e, *p; |
879 | PPCODE: | |
a092bcfd | 880 | s = (U8*)sv_2pvunicode(src,&srclen); |
82e740b6 | 881 | e = s + srclen; |
fe067ad9 SP |
882 | p = e; |
883 | while (s < p) { | |
884 | UV uv; | |
82e740b6 NC |
885 | p = utf8_hop(p, -1); |
886 | if (p < s) | |
887 | croak(ErrHopBeforeStart); | |
fe067ad9 | 888 | uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF); |
82e740b6 NC |
889 | if (getCombinClass(uv) == 0) /* Last Starter found */ |
890 | break; | |
891 | } | |
892 | ||
893 | svp = sv_2mortal(newSVpvn((char*)s, p - s)); | |
894 | SvUTF8_on(svp); | |
895 | XPUSHs(svp); | |
896 | ||
897 | svp = sv_2mortal(newSVpvn((char*)p, e - p)); | |
898 | SvUTF8_on(svp); | |
899 | XPUSHs(svp); | |
900 |