Commit | Line | Data |
---|---|---|
ac5ea531 | 1 | |
a257baa9 CBW |
2 | #define PERL_NO_GET_CONTEXT /* we want efficiency */ |
3 | ||
4 | /* private functions which need pTHX_ and aTHX_ | |
5 | pv_cat_decompHangul | |
6 | sv_2pvunicode | |
7 | pv_utf8_decompose | |
8 | pv_utf8_reorder | |
9 | pv_utf8_compose | |
10 | */ | |
11 | ||
ac5ea531 JH |
12 | #include "EXTERN.h" |
13 | #include "perl.h" | |
14 | #include "XSUB.h" | |
15 | ||
16 | /* These 5 files are prepared by mkheader */ | |
17 | #include "unfcmb.h" | |
18 | #include "unfcan.h" | |
19 | #include "unfcpt.h" | |
20 | #include "unfcmp.h" | |
21 | #include "unfexc.h" | |
22 | ||
23 | /* Perl 5.6.1 ? */ | |
24 | #ifndef uvuni_to_utf8 | |
25 | #define uvuni_to_utf8 uv_to_utf8 | |
6c941e0c | 26 | #endif /* uvuni_to_utf8 */ |
ac5ea531 JH |
27 | |
28 | /* Perl 5.6.1 ? */ | |
ab8fe378 | 29 | #ifndef utf8n_to_uvuni |
788ba0f4 | 30 | #define utf8n_to_uvuni utf8_to_uv |
6c941e0c | 31 | #endif /* utf8n_to_uvuni */ |
ac5ea531 | 32 | |
e524f5b2 | 33 | /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ |
788ba0f4 CBW |
34 | #ifndef UTF8_ALLOW_BOM |
35 | #define UTF8_ALLOW_BOM (0) | |
36 | #endif /* UTF8_ALLOW_BOM */ | |
37 | ||
38 | #ifndef UTF8_ALLOW_SURROGATE | |
39 | #define UTF8_ALLOW_SURROGATE (0) | |
40 | #endif /* UTF8_ALLOW_SURROGATE */ | |
41 | ||
42 | #ifndef UTF8_ALLOW_FE_FF | |
43 | #define UTF8_ALLOW_FE_FF (0) | |
44 | #endif /* UTF8_ALLOW_FE_FF */ | |
45 | ||
46 | #ifndef UTF8_ALLOW_FFFF | |
47 | #define UTF8_ALLOW_FFFF (0) | |
48 | #endif /* UTF8_ALLOW_FFFF */ | |
49 | ||
50 | #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) | |
51 | ||
52 | /* check if the string buffer is enough before uvuni_to_utf8(). */ | |
53 | /* dstart, d, and dlen should be defined outside before. */ | |
54 | #define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \ | |
55 | if (dlen < curlen + (need)) { \ | |
56 | dlen += (need); \ | |
57 | Renew(dstart, dlen+1, U8); \ | |
58 | d = dstart + curlen; \ | |
59 | } | |
e524f5b2 | 60 | |
788ba0f4 | 61 | /* if utf8n_to_uvuni() sets retlen to 0 (if broken?) */ |
fe067ad9 | 62 | #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" |
82e740b6 NC |
63 | |
64 | /* utf8_hop() hops back before start. Maybe broken UTF-8 */ | |
65 | #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" | |
66 | ||
ac5ea531 JH |
67 | /* At present, char > 0x10ffff are unaffected without complaint, right? */ |
68 | #define VALID_UTF_MAX (0x10ffff) | |
69 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) | |
70 | ||
fe067ad9 SP |
71 | /* size of array for combining characters */ |
72 | /* enough as an initial value? */ | |
73 | #define CC_SEQ_SIZE (10) | |
74 | #define CC_SEQ_STEP (5) | |
75 | ||
76 | /* HANGUL begin */ | |
ac5ea531 JH |
77 | #define Hangul_SBase 0xAC00 |
78 | #define Hangul_SFinal 0xD7A3 | |
79 | #define Hangul_SCount 11172 | |
80 | ||
81 | #define Hangul_NCount 588 | |
82 | ||
83 | #define Hangul_LBase 0x1100 | |
84 | #define Hangul_LFinal 0x1112 | |
85 | #define Hangul_LCount 19 | |
86 | ||
87 | #define Hangul_VBase 0x1161 | |
88 | #define Hangul_VFinal 0x1175 | |
89 | #define Hangul_VCount 21 | |
90 | ||
91 | #define Hangul_TBase 0x11A7 | |
92 | #define Hangul_TFinal 0x11C2 | |
93 | #define Hangul_TCount 28 | |
94 | ||
95 | #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) | |
2a204b45 | 96 | #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) |
ac5ea531 JH |
97 | #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) |
98 | #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) | |
99 | #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) | |
100 | #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) | |
fe067ad9 | 101 | /* HANGUL end */ |
ac5ea531 JH |
102 | |
103 | /* this is used for canonical ordering of combining characters (c.c.). */ | |
104 | typedef struct { | |
105 | U8 cc; /* combining class */ | |
106 | UV uv; /* codepoint */ | |
107 | STRLEN pos; /* position */ | |
108 | } UNF_cc; | |
109 | ||
fe067ad9 | 110 | static int compare_cc(const void *a, const void *b) |
ac5ea531 JH |
111 | { |
112 | int ret_cc; | |
6c941e0c | 113 | ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; |
8f118dcd JH |
114 | if (ret_cc) |
115 | return ret_cc; | |
6c941e0c JH |
116 | |
117 | return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) | |
118 | - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); | |
ac5ea531 JH |
119 | } |
120 | ||
fe067ad9 | 121 | static U8* dec_canonical(UV uv) |
ac5ea531 JH |
122 | { |
123 | U8 ***plane, **row; | |
8f118dcd JH |
124 | if (OVER_UTF_MAX(uv)) |
125 | return NULL; | |
ac5ea531 | 126 | plane = (U8***)UNF_canon[uv >> 16]; |
8f118dcd JH |
127 | if (! plane) |
128 | return NULL; | |
ac5ea531 JH |
129 | row = plane[(uv >> 8) & 0xff]; |
130 | return row ? row[uv & 0xff] : NULL; | |
131 | } | |
132 | ||
fe067ad9 | 133 | static U8* dec_compat(UV uv) |
ac5ea531 JH |
134 | { |
135 | U8 ***plane, **row; | |
8f118dcd JH |
136 | if (OVER_UTF_MAX(uv)) |
137 | return NULL; | |
ac5ea531 | 138 | plane = (U8***)UNF_compat[uv >> 16]; |
8f118dcd JH |
139 | if (! plane) |
140 | return NULL; | |
ac5ea531 JH |
141 | row = plane[(uv >> 8) & 0xff]; |
142 | return row ? row[uv & 0xff] : NULL; | |
143 | } | |
144 | ||
fe067ad9 | 145 | static UV composite_uv(UV uv, UV uv2) |
ac5ea531 JH |
146 | { |
147 | UNF_complist ***plane, **row, *cell, *i; | |
148 | ||
fe067ad9 | 149 | if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) |
8f118dcd | 150 | return 0; |
ac5ea531 | 151 | |
8f118dcd | 152 | if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { |
fe067ad9 SP |
153 | UV lindex = uv - Hangul_LBase; |
154 | UV vindex = uv2 - Hangul_VBase; | |
155 | return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * | |
156 | Hangul_TCount); | |
ac5ea531 | 157 | } |
8f118dcd | 158 | if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { |
fe067ad9 SP |
159 | UV tindex = uv2 - Hangul_TBase; |
160 | return(uv + tindex); | |
ac5ea531 JH |
161 | } |
162 | plane = UNF_compos[uv >> 16]; | |
8f118dcd JH |
163 | if (! plane) |
164 | return 0; | |
ac5ea531 | 165 | row = plane[(uv >> 8) & 0xff]; |
8f118dcd JH |
166 | if (! row) |
167 | return 0; | |
ac5ea531 | 168 | cell = row[uv & 0xff]; |
8f118dcd JH |
169 | if (! cell) |
170 | return 0; | |
171 | for (i = cell; i->nextchar; i++) { | |
172 | if (uv2 == i->nextchar) | |
173 | return i->composite; | |
ac5ea531 JH |
174 | } |
175 | return 0; | |
176 | } | |
177 | ||
fe067ad9 | 178 | static U8 getCombinClass(UV uv) |
ac5ea531 JH |
179 | { |
180 | U8 **plane, *row; | |
8f118dcd JH |
181 | if (OVER_UTF_MAX(uv)) |
182 | return 0; | |
ac5ea531 | 183 | plane = (U8**)UNF_combin[uv >> 16]; |
8f118dcd JH |
184 | if (! plane) |
185 | return 0; | |
ac5ea531 JH |
186 | row = plane[(uv >> 8) & 0xff]; |
187 | return row ? row[uv & 0xff] : 0; | |
188 | } | |
189 | ||
a257baa9 | 190 | static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv) |
ac5ea531 | 191 | { |
fe067ad9 SP |
192 | UV sindex = uv - Hangul_SBase; |
193 | UV lindex = sindex / Hangul_NCount; | |
194 | UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; | |
195 | UV tindex = sindex % Hangul_TCount; | |
ac5ea531 | 196 | |
8f118dcd | 197 | if (! Hangul_IsS(uv)) |
fe067ad9 | 198 | return d; |
ac5ea531 | 199 | |
fe067ad9 SP |
200 | d = uvuni_to_utf8(d, (lindex + Hangul_LBase)); |
201 | d = uvuni_to_utf8(d, (vindex + Hangul_VBase)); | |
8f118dcd | 202 | if (tindex) |
fe067ad9 SP |
203 | d = uvuni_to_utf8(d, (tindex + Hangul_TBase)); |
204 | return d; | |
ac5ea531 JH |
205 | } |
206 | ||
a257baa9 | 207 | static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp) |
a092bcfd RGS |
208 | { |
209 | char *s; | |
210 | STRLEN len; | |
39f4556f | 211 | s = SvPV(sv,len); |
a092bcfd | 212 | if (!SvUTF8(sv)) { |
39f4556f | 213 | SV* tmpsv = sv_2mortal(newSVpvn(s, len)); |
a092bcfd | 214 | if (!SvPOK(tmpsv)) |
39f4556f | 215 | s = SvPV_force(tmpsv,len); |
a092bcfd | 216 | sv_utf8_upgrade(tmpsv); |
39f4556f | 217 | s = SvPV(tmpsv,len); |
a092bcfd | 218 | } |
fe067ad9 SP |
219 | if (lp) |
220 | *lp = len; | |
a092bcfd RGS |
221 | return s; |
222 | } | |
223 | ||
fe067ad9 | 224 | static |
a257baa9 | 225 | U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) |
fe067ad9 SP |
226 | { |
227 | U8* p = s; | |
228 | U8* e = s + slen; | |
229 | U8* dstart = *dp; | |
230 | U8* d = dstart; | |
231 | ||
232 | while (p < e) { | |
233 | STRLEN retlen; | |
234 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); | |
82e740b6 | 235 | if (!retlen) |
fe067ad9 SP |
236 | croak(ErrRetlenIsZero, "decompose"); |
237 | p += retlen; | |
238 | ||
239 | if (Hangul_IsS(uv)) { | |
788ba0f4 | 240 | Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) |
a257baa9 | 241 | d = pv_cat_decompHangul(aTHX_ d, uv); |
fe067ad9 | 242 | } |
ac5ea531 | 243 | else { |
fe067ad9 SP |
244 | U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); |
245 | ||
246 | if (r) { | |
247 | STRLEN len = (STRLEN)strlen((char *)r); | |
788ba0f4 | 248 | Renew_d_if_not_enough_to(len) |
fe067ad9 SP |
249 | while (len--) |
250 | *d++ = *r++; | |
251 | } | |
252 | else { | |
788ba0f4 | 253 | Renew_d_if_not_enough_to(UTF8_MAXLEN) |
fe067ad9 SP |
254 | d = uvuni_to_utf8(d, uv); |
255 | } | |
ac5ea531 JH |
256 | } |
257 | } | |
fe067ad9 SP |
258 | *dp = dstart; |
259 | return d; | |
260 | } | |
ac5ea531 | 261 | |
fe067ad9 | 262 | static |
a257baa9 | 263 | U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) |
fe067ad9 SP |
264 | { |
265 | U8* p = s; | |
266 | U8* e = s + slen; | |
788ba0f4 CBW |
267 | U8* dstart = *dp; |
268 | U8* d = dstart; | |
fe067ad9 SP |
269 | |
270 | UNF_cc seq_ary[CC_SEQ_SIZE]; | |
271 | UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ | |
272 | UNF_cc* seq_ext = NULL; /* extend if need */ | |
273 | STRLEN seq_max = CC_SEQ_SIZE; | |
274 | STRLEN cc_pos = 0; | |
275 | ||
fe067ad9 SP |
276 | while (p < e) { |
277 | U8 curCC; | |
278 | STRLEN retlen; | |
279 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); | |
280 | if (!retlen) | |
281 | croak(ErrRetlenIsZero, "reorder"); | |
282 | p += retlen; | |
ac5ea531 | 283 | |
fe067ad9 | 284 | curCC = getCombinClass(uv); |
ac5ea531 | 285 | |
fe067ad9 SP |
286 | if (curCC != 0) { |
287 | if (seq_max < cc_pos + 1) { /* extend if need */ | |
288 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ | |
289 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ | |
290 | STRLEN i; | |
291 | New(0, seq_ext, seq_max, UNF_cc); | |
292 | for (i = 0; i < cc_pos; i++) | |
293 | seq_ext[i] = seq_ary[i]; | |
294 | } | |
295 | else { | |
296 | Renew(seq_ext, seq_max, UNF_cc); | |
297 | } | |
39f4556f | 298 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 | 299 | } |
a092bcfd | 300 | |
fe067ad9 SP |
301 | seq_ptr[cc_pos].cc = curCC; |
302 | seq_ptr[cc_pos].uv = uv; | |
303 | seq_ptr[cc_pos].pos = cc_pos; | |
304 | ++cc_pos; | |
ac5ea531 | 305 | |
fe067ad9 SP |
306 | if (p < e) |
307 | continue; | |
308 | } | |
ac5ea531 | 309 | |
788ba0f4 | 310 | /* output */ |
fe067ad9 SP |
311 | if (cc_pos) { |
312 | STRLEN i; | |
313 | ||
314 | if (cc_pos > 1) /* reordered if there are two c.c.'s */ | |
315 | qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); | |
316 | ||
317 | for (i = 0; i < cc_pos; i++) { | |
788ba0f4 | 318 | Renew_d_if_not_enough_to(UTF8_MAXLEN) |
fe067ad9 | 319 | d = uvuni_to_utf8(d, seq_ptr[i].uv); |
fe067ad9 SP |
320 | } |
321 | cc_pos = 0; | |
322 | } | |
2a204b45 | 323 | |
e524f5b2 | 324 | if (curCC == 0) { |
788ba0f4 | 325 | Renew_d_if_not_enough_to(UTF8_MAXLEN) |
e524f5b2 | 326 | d = uvuni_to_utf8(d, uv); |
e524f5b2 | 327 | } |
fe067ad9 SP |
328 | } |
329 | if (seq_ext) | |
330 | Safefree(seq_ext); | |
788ba0f4 | 331 | *dp = dstart; |
fe067ad9 SP |
332 | return d; |
333 | } | |
ac5ea531 | 334 | |
fe067ad9 | 335 | static |
a257baa9 | 336 | U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) |
fe067ad9 SP |
337 | { |
338 | U8* p = s; | |
339 | U8* e = s + slen; | |
788ba0f4 CBW |
340 | U8* dstart = *dp; |
341 | U8* d = dstart; | |
fe067ad9 | 342 | |
39f4556f | 343 | UV uvS = 0; /* code point of the starter */ |
fe067ad9 SP |
344 | bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ |
345 | U8 preCC = 0; | |
346 | ||
347 | UV seq_ary[CC_SEQ_SIZE]; | |
348 | UV* seq_ptr = seq_ary; /* use array at the beginning */ | |
349 | UV* seq_ext = NULL; /* extend if need */ | |
350 | STRLEN seq_max = CC_SEQ_SIZE; | |
351 | STRLEN cc_pos = 0; | |
352 | ||
fe067ad9 SP |
353 | while (p < e) { |
354 | U8 curCC; | |
355 | STRLEN retlen; | |
356 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); | |
357 | if (!retlen) | |
358 | croak(ErrRetlenIsZero, "compose"); | |
359 | p += retlen; | |
ac5ea531 | 360 | |
fe067ad9 | 361 | curCC = getCombinClass(uv); |
82e740b6 | 362 | |
fe067ad9 | 363 | if (!valid_uvS) { |
e524f5b2 | 364 | if (curCC == 0) { |
fe067ad9 SP |
365 | uvS = uv; /* the first Starter is found */ |
366 | valid_uvS = TRUE; | |
367 | if (p < e) | |
368 | continue; | |
e524f5b2 | 369 | } |
fe067ad9 | 370 | else { |
788ba0f4 | 371 | Renew_d_if_not_enough_to(UTF8_MAXLEN) |
fe067ad9 | 372 | d = uvuni_to_utf8(d, uv); |
fe067ad9 SP |
373 | continue; |
374 | } | |
375 | } | |
376 | else { | |
377 | bool composed; | |
378 | ||
379 | /* blocked */ | |
c33e8be1 Z |
380 | if ((iscontig && cc_pos) || /* discontiguous combination */ |
381 | (curCC != 0 && preCC == curCC) || /* blocked by same CC */ | |
788ba0f4 | 382 | (preCC > curCC)) /* blocked by higher CC: revised D2 */ |
fe067ad9 SP |
383 | composed = FALSE; |
384 | ||
385 | /* not blocked: | |
386 | iscontig && cc_pos == 0 -- contiguous combination | |
387 | curCC == 0 && preCC == 0 -- starter + starter | |
388 | curCC != 0 && preCC < curCC -- lower CC */ | |
389 | else { | |
390 | /* try composition */ | |
391 | UV uvComp = composite_uv(uvS, uv); | |
392 | ||
393 | if (uvComp && !isExclusion(uvComp)) { | |
394 | uvS = uvComp; | |
395 | composed = TRUE; | |
82e740b6 | 396 | |
fe067ad9 SP |
397 | /* preCC should not be changed to curCC */ |
398 | /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ | |
399 | if (p < e) | |
400 | continue; | |
401 | } | |
402 | else | |
403 | composed = FALSE; | |
404 | } | |
405 | ||
406 | if (!composed) { | |
407 | preCC = curCC; | |
408 | if (curCC != 0 || !(p < e)) { | |
409 | if (seq_max < cc_pos + 1) { /* extend if need */ | |
410 | seq_max = cc_pos + CC_SEQ_STEP; /* new size */ | |
411 | if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ | |
412 | New(0, seq_ext, seq_max, UV); | |
413 | Copy(seq_ary, seq_ext, cc_pos, UV); | |
414 | } | |
415 | else { | |
416 | Renew(seq_ext, seq_max, UV); | |
417 | } | |
2b8d773d | 418 | seq_ptr = seq_ext; /* use seq_ext from now */ |
fe067ad9 SP |
419 | } |
420 | seq_ptr[cc_pos] = uv; | |
421 | ++cc_pos; | |
422 | } | |
423 | if (curCC != 0 && p < e) | |
424 | continue; | |
ac5ea531 | 425 | } |
ac5ea531 JH |
426 | } |
427 | ||
788ba0f4 CBW |
428 | /* output */ |
429 | { | |
430 | Renew_d_if_not_enough_to(UTF8_MAXLEN) | |
431 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ | |
432 | } | |
fe067ad9 | 433 | |
e524f5b2 | 434 | if (cc_pos) { |
fe067ad9 | 435 | STRLEN i; |
ac5ea531 | 436 | |
fe067ad9 | 437 | for (i = 0; i < cc_pos; i++) { |
788ba0f4 | 438 | Renew_d_if_not_enough_to(UTF8_MAXLEN) |
fe067ad9 | 439 | d = uvuni_to_utf8(d, seq_ptr[i]); |
fe067ad9 SP |
440 | } |
441 | cc_pos = 0; | |
ac5ea531 | 442 | } |
fe067ad9 SP |
443 | |
444 | uvS = uv; | |
ac5ea531 | 445 | } |
fe067ad9 SP |
446 | if (seq_ext) |
447 | Safefree(seq_ext); | |
788ba0f4 | 448 | *dp = dstart; |
fe067ad9 SP |
449 | return d; |
450 | } | |
451 | ||
452 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize | |
453 | ||
454 | SV* | |
455 | decompose(src, compat = &PL_sv_no) | |
456 | SV * src | |
457 | SV * compat | |
458 | PROTOTYPE: $;$ | |
459 | PREINIT: | |
460 | SV* dst; | |
461 | U8 *s, *d, *dend; | |
462 | STRLEN slen, dlen; | |
463 | CODE: | |
a257baa9 | 464 | s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
fe067ad9 SP |
465 | dst = newSVpvn("", 0); |
466 | dlen = slen; | |
467 | New(0, d, dlen+1, U8); | |
a257baa9 | 468 | dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat)); |
9b374906 | 469 | sv_setpvn(dst, (char *)d, dend - d); |
fe067ad9 SP |
470 | SvUTF8_on(dst); |
471 | Safefree(d); | |
8f118dcd | 472 | RETVAL = dst; |
ac5ea531 JH |
473 | OUTPUT: |
474 | RETVAL | |
475 | ||
788ba0f4 | 476 | |
fe067ad9 SP |
477 | SV* |
478 | reorder(src) | |
479 | SV * src | |
480 | PROTOTYPE: $ | |
481 | PREINIT: | |
482 | SV* dst; | |
483 | U8 *s, *d, *dend; | |
484 | STRLEN slen, dlen; | |
485 | CODE: | |
a257baa9 | 486 | s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
fe067ad9 | 487 | dst = newSVpvn("", 0); |
788ba0f4 CBW |
488 | dlen = slen; |
489 | New(0, d, dlen+1, U8); | |
a257baa9 | 490 | dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen); |
788ba0f4 | 491 | sv_setpvn(dst, (char *)d, dend - d); |
fe067ad9 | 492 | SvUTF8_on(dst); |
788ba0f4 | 493 | Safefree(d); |
fe067ad9 SP |
494 | RETVAL = dst; |
495 | OUTPUT: | |
496 | RETVAL | |
ac5ea531 | 497 | |
788ba0f4 | 498 | |
2a204b45 | 499 | SV* |
a092bcfd RGS |
500 | compose(src) |
501 | SV * src | |
ac5ea531 | 502 | PROTOTYPE: $ |
82e740b6 NC |
503 | ALIAS: |
504 | composeContiguous = 1 | |
ac5ea531 | 505 | PREINIT: |
fe067ad9 SP |
506 | SV* dst; |
507 | U8 *s, *d, *dend; | |
508 | STRLEN slen, dlen; | |
2a204b45 | 509 | CODE: |
a257baa9 | 510 | s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
fe067ad9 | 511 | dst = newSVpvn("", 0); |
788ba0f4 CBW |
512 | dlen = slen; |
513 | New(0, d, dlen+1, U8); | |
a257baa9 | 514 | dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix); |
788ba0f4 | 515 | sv_setpvn(dst, (char *)d, dend - d); |
ac5ea531 | 516 | SvUTF8_on(dst); |
788ba0f4 | 517 | Safefree(d); |
fe067ad9 SP |
518 | RETVAL = dst; |
519 | OUTPUT: | |
520 | RETVAL | |
ac5ea531 | 521 | |
788ba0f4 | 522 | |
fe067ad9 SP |
523 | SV* |
524 | NFD(src) | |
525 | SV * src | |
526 | PROTOTYPE: $ | |
527 | ALIAS: | |
528 | NFKD = 1 | |
529 | PREINIT: | |
530 | SV *dst; | |
531 | U8 *s, *t, *tend, *d, *dend; | |
532 | STRLEN slen, tlen, dlen; | |
533 | CODE: | |
a257baa9 | 534 | s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
788ba0f4 CBW |
535 | |
536 | /* decompose */ | |
fe067ad9 SP |
537 | tlen = slen; |
538 | New(0, t, tlen+1, U8); | |
a257baa9 | 539 | tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); |
fe067ad9 | 540 | *tend = '\0'; |
788ba0f4 | 541 | tlen = tend - t; /* no longer know real size of t */ |
fe067ad9 SP |
542 | |
543 | /* reorder */ | |
788ba0f4 CBW |
544 | dlen = tlen; |
545 | New(0, d, dlen+1, U8); | |
a257baa9 | 546 | dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen); |
fe067ad9 | 547 | *dend = '\0'; |
788ba0f4 | 548 | dlen = dend - d; /* no longer know real size of d */ |
ac5ea531 | 549 | |
fe067ad9 | 550 | /* return */ |
788ba0f4 CBW |
551 | dst = newSVpvn("", 0); |
552 | sv_setpvn(dst, (char *)d, dlen); | |
553 | SvUTF8_on(dst); | |
554 | ||
fe067ad9 | 555 | Safefree(t); |
788ba0f4 | 556 | Safefree(d); |
fe067ad9 SP |
557 | RETVAL = dst; |
558 | OUTPUT: | |
559 | RETVAL | |
82e740b6 | 560 | |
788ba0f4 | 561 | |
fe067ad9 SP |
562 | SV* |
563 | NFC(src) | |
564 | SV * src | |
565 | PROTOTYPE: $ | |
566 | ALIAS: | |
567 | NFKC = 1 | |
568 | FCC = 2 | |
569 | PREINIT: | |
570 | SV *dst; | |
571 | U8 *s, *t, *tend, *u, *uend, *d, *dend; | |
572 | STRLEN slen, tlen, ulen, dlen; | |
573 | CODE: | |
a257baa9 | 574 | s = (U8*)sv_2pvunicode(aTHX_ src,&slen); |
788ba0f4 CBW |
575 | |
576 | /* decompose */ | |
fe067ad9 SP |
577 | tlen = slen; |
578 | New(0, t, tlen+1, U8); | |
a257baa9 | 579 | tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); |
fe067ad9 | 580 | *tend = '\0'; |
788ba0f4 | 581 | tlen = tend - t; /* no longer know real size of t */ |
fe067ad9 SP |
582 | |
583 | /* reorder */ | |
788ba0f4 | 584 | ulen = tlen; |
fe067ad9 | 585 | New(0, u, ulen+1, U8); |
a257baa9 | 586 | uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen); |
fe067ad9 | 587 | *uend = '\0'; |
788ba0f4 | 588 | ulen = uend - u; /* no longer know real size of u */ |
fe067ad9 SP |
589 | |
590 | /* compose */ | |
788ba0f4 CBW |
591 | dlen = ulen; |
592 | New(0, d, dlen+1, U8); | |
a257baa9 | 593 | dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2)); |
fe067ad9 | 594 | *dend = '\0'; |
788ba0f4 | 595 | dlen = dend - d; /* no longer know real size of d */ |
ac5ea531 | 596 | |
fe067ad9 | 597 | /* return */ |
788ba0f4 CBW |
598 | dst = newSVpvn("", 0); |
599 | sv_setpvn(dst, (char *)d, dlen); | |
600 | SvUTF8_on(dst); | |
601 | ||
fe067ad9 SP |
602 | Safefree(t); |
603 | Safefree(u); | |
788ba0f4 | 604 | Safefree(d); |
2a204b45 JH |
605 | RETVAL = dst; |
606 | OUTPUT: | |
607 | RETVAL | |
ac5ea531 | 608 | |
788ba0f4 | 609 | |
2b8d773d | 610 | SV* |
a092bcfd RGS |
611 | checkNFD(src) |
612 | SV * src | |
8f118dcd JH |
613 | PROTOTYPE: $ |
614 | ALIAS: | |
615 | checkNFKD = 1 | |
616 | PREINIT: | |
8f118dcd JH |
617 | STRLEN srclen, retlen; |
618 | U8 *s, *e, *p, curCC, preCC; | |
2b8d773d | 619 | bool result = TRUE; |
82e740b6 | 620 | CODE: |
a257baa9 | 621 | s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
8f118dcd JH |
622 | e = s + srclen; |
623 | ||
624 | preCC = 0; | |
625 | for (p = s; p < e; p += retlen) { | |
fe067ad9 | 626 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 | 627 | if (!retlen) |
fe067ad9 | 628 | croak(ErrRetlenIsZero, "checkNFD or -NFKD"); |
82e740b6 | 629 | |
8f118dcd | 630 | curCC = getCombinClass(uv); |
2b8d773d RGS |
631 | if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
632 | result = FALSE; | |
633 | break; | |
634 | } | |
635 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { | |
636 | result = FALSE; | |
637 | break; | |
638 | } | |
8f118dcd JH |
639 | preCC = curCC; |
640 | } | |
2b8d773d RGS |
641 | RETVAL = boolSV(result); |
642 | OUTPUT: | |
643 | RETVAL | |
8f118dcd JH |
644 | |
645 | ||
2b8d773d | 646 | SV* |
a092bcfd RGS |
647 | checkNFC(src) |
648 | SV * src | |
8f118dcd JH |
649 | PROTOTYPE: $ |
650 | ALIAS: | |
651 | checkNFKC = 1 | |
652 | PREINIT: | |
8f118dcd JH |
653 | STRLEN srclen, retlen; |
654 | U8 *s, *e, *p, curCC, preCC; | |
2b8d773d RGS |
655 | bool result = TRUE; |
656 | bool isMAYBE = FALSE; | |
82e740b6 | 657 | CODE: |
a257baa9 | 658 | s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
8f118dcd JH |
659 | e = s + srclen; |
660 | ||
661 | preCC = 0; | |
8f118dcd | 662 | for (p = s; p < e; p += retlen) { |
fe067ad9 | 663 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 | 664 | if (!retlen) |
fe067ad9 | 665 | croak(ErrRetlenIsZero, "checkNFC or -NFKC"); |
82e740b6 | 666 | |
8f118dcd | 667 | curCC = getCombinClass(uv); |
2b8d773d RGS |
668 | if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ |
669 | result = FALSE; | |
670 | break; | |
671 | } | |
8f118dcd JH |
672 | |
673 | /* get NFC/NFKC property */ | |
674 | if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ | |
675 | ; /* YES */ | |
2b8d773d RGS |
676 | else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
677 | result = FALSE; | |
678 | break; | |
679 | } | |
8f118dcd JH |
680 | else if (isComp2nd(uv)) |
681 | isMAYBE = TRUE; | |
682 | else if (ix) { | |
683 | char *canon, *compat; | |
6c941e0c | 684 | /* NFKC_NO when having compatibility mapping. */ |
8f118dcd JH |
685 | canon = (char *) dec_canonical(uv); |
686 | compat = (char *) dec_compat(uv); | |
2b8d773d RGS |
687 | if (compat && !(canon && strEQ(canon, compat))) { |
688 | result = FALSE; | |
689 | break; | |
690 | } | |
8f118dcd JH |
691 | } /* end of get NFC/NFKC property */ |
692 | ||
693 | preCC = curCC; | |
694 | } | |
2b8d773d | 695 | if (isMAYBE && result) /* NO precedes MAYBE */ |
8f118dcd | 696 | XSRETURN_UNDEF; |
2b8d773d RGS |
697 | RETVAL = boolSV(result); |
698 | OUTPUT: | |
699 | RETVAL | |
8f118dcd JH |
700 | |
701 | ||
2b8d773d | 702 | SV* |
a092bcfd RGS |
703 | checkFCD(src) |
704 | SV * src | |
82e740b6 NC |
705 | PROTOTYPE: $ |
706 | ALIAS: | |
707 | checkFCC = 1 | |
708 | PREINIT: | |
fe067ad9 | 709 | STRLEN srclen, retlen; |
82e740b6 | 710 | U8 *s, *e, *p, curCC, preCC; |
2b8d773d RGS |
711 | bool result = TRUE; |
712 | bool isMAYBE = FALSE; | |
82e740b6 | 713 | CODE: |
a257baa9 | 714 | s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
82e740b6 | 715 | e = s + srclen; |
82e740b6 | 716 | preCC = 0; |
82e740b6 | 717 | for (p = s; p < e; p += retlen) { |
fe067ad9 SP |
718 | U8 *sCan; |
719 | UV uvLead; | |
39f4556f | 720 | STRLEN canlen = 0; |
fe067ad9 | 721 | UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF); |
82e740b6 | 722 | if (!retlen) |
fe067ad9 | 723 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); |
82e740b6 NC |
724 | |
725 | sCan = (U8*) dec_canonical(uv); | |
726 | ||
727 | if (sCan) { | |
39f4556f | 728 | STRLEN canret; |
82e740b6 | 729 | canlen = (STRLEN)strlen((char *) sCan); |
e524f5b2 | 730 | uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF); |
fe067ad9 SP |
731 | if (!canret) |
732 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); | |
82e740b6 NC |
733 | } |
734 | else { | |
735 | uvLead = uv; | |
736 | } | |
737 | ||
738 | curCC = getCombinClass(uvLead); | |
739 | ||
2b8d773d RGS |
740 | if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ |
741 | result = FALSE; | |
742 | break; | |
743 | } | |
82e740b6 NC |
744 | |
745 | if (ix) { | |
2b8d773d RGS |
746 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { |
747 | result = FALSE; | |
748 | break; | |
749 | } | |
82e740b6 NC |
750 | else if (isComp2nd(uv)) |
751 | isMAYBE = TRUE; | |
752 | } | |
753 | ||
754 | if (sCan) { | |
39f4556f | 755 | STRLEN canret; |
fe067ad9 SP |
756 | UV uvTrail; |
757 | U8* eCan = sCan + canlen; | |
758 | U8* pCan = utf8_hop(eCan, -1); | |
82e740b6 NC |
759 | if (pCan < sCan) |
760 | croak(ErrHopBeforeStart); | |
e524f5b2 | 761 | uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF); |
fe067ad9 SP |
762 | if (!canret) |
763 | croak(ErrRetlenIsZero, "checkFCD or -FCC"); | |
82e740b6 NC |
764 | preCC = getCombinClass(uvTrail); |
765 | } | |
766 | else { | |
767 | preCC = curCC; | |
768 | } | |
769 | } | |
2b8d773d | 770 | if (isMAYBE && result) /* NO precedes MAYBE */ |
82e740b6 | 771 | XSRETURN_UNDEF; |
2b8d773d RGS |
772 | RETVAL = boolSV(result); |
773 | OUTPUT: | |
774 | RETVAL | |
82e740b6 NC |
775 | |
776 | ||
ac5ea531 JH |
777 | U8 |
778 | getCombinClass(uv) | |
779 | UV uv | |
8f118dcd | 780 | PROTOTYPE: $ |
ac5ea531 JH |
781 | |
782 | bool | |
2a204b45 | 783 | isExclusion(uv) |
ac5ea531 | 784 | UV uv |
8f118dcd JH |
785 | PROTOTYPE: $ |
786 | ||
787 | bool | |
788 | isSingleton(uv) | |
789 | UV uv | |
790 | PROTOTYPE: $ | |
791 | ||
792 | bool | |
793 | isNonStDecomp(uv) | |
794 | UV uv | |
795 | PROTOTYPE: $ | |
796 | ||
797 | bool | |
798 | isComp2nd(uv) | |
799 | UV uv | |
800 | PROTOTYPE: $ | |
801 | ALIAS: | |
802 | isNFC_MAYBE = 1 | |
803 | isNFKC_MAYBE = 2 | |
804 | ||
805 | ||
806 | ||
2b8d773d | 807 | SV* |
8f118dcd JH |
808 | isNFD_NO(uv) |
809 | UV uv | |
810 | PROTOTYPE: $ | |
811 | ALIAS: | |
812 | isNFKD_NO = 1 | |
2b8d773d RGS |
813 | PREINIT: |
814 | bool result = FALSE; | |
82e740b6 | 815 | CODE: |
8f118dcd | 816 | if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) |
2b8d773d RGS |
817 | result = TRUE; /* NFD_NO or NFKD_NO */ |
818 | RETVAL = boolSV(result); | |
819 | OUTPUT: | |
820 | RETVAL | |
8f118dcd JH |
821 | |
822 | ||
2b8d773d | 823 | SV* |
8f118dcd JH |
824 | isComp_Ex(uv) |
825 | UV uv | |
826 | PROTOTYPE: $ | |
827 | ALIAS: | |
828 | isNFC_NO = 0 | |
829 | isNFKC_NO = 1 | |
2b8d773d RGS |
830 | PREINIT: |
831 | bool result = FALSE; | |
82e740b6 | 832 | CODE: |
8f118dcd | 833 | if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) |
2b8d773d | 834 | result = TRUE; /* NFC_NO or NFKC_NO */ |
8f118dcd JH |
835 | else if (ix) { |
836 | char *canon, *compat; | |
837 | canon = (char *) dec_canonical(uv); | |
838 | compat = (char *) dec_compat(uv); | |
839 | if (compat && (!canon || strNE(canon, compat))) | |
2b8d773d | 840 | result = TRUE; /* NFC_NO or NFKC_NO */ |
8f118dcd | 841 | } |
2b8d773d RGS |
842 | RETVAL = boolSV(result); |
843 | OUTPUT: | |
844 | RETVAL | |
ac5ea531 | 845 | |
2a204b45 | 846 | SV* |
ac5ea531 JH |
847 | getComposite(uv, uv2) |
848 | UV uv | |
849 | UV uv2 | |
2a204b45 JH |
850 | PROTOTYPE: $$ |
851 | PREINIT: | |
bcdb689b | 852 | UV composite; |
2a204b45 | 853 | CODE: |
bcdb689b JH |
854 | composite = composite_uv(uv, uv2); |
855 | RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; | |
2a204b45 JH |
856 | OUTPUT: |
857 | RETVAL | |
ac5ea531 | 858 | |
8f118dcd JH |
859 | |
860 | ||
ac5ea531 JH |
861 | SV* |
862 | getCanon(uv) | |
863 | UV uv | |
864 | PROTOTYPE: $ | |
865 | ALIAS: | |
866 | getCompat = 1 | |
ac5ea531 | 867 | CODE: |
8f118dcd | 868 | if (Hangul_IsS(uv)) { |
fe067ad9 SP |
869 | U8 tmp[3 * UTF8_MAXLEN + 1]; |
870 | U8 *t = tmp; | |
a257baa9 | 871 | U8 *e = pv_cat_decompHangul(aTHX_ t, uv); |
fe067ad9 | 872 | RETVAL = newSVpvn((char *)t, e - t); |
ac5ea531 | 873 | } else { |
fe067ad9 | 874 | U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); |
8f118dcd JH |
875 | if (!rstr) |
876 | XSRETURN_UNDEF; | |
ac5ea531 JH |
877 | RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); |
878 | } | |
879 | SvUTF8_on(RETVAL); | |
880 | OUTPUT: | |
881 | RETVAL | |
882 | ||
82e740b6 NC |
883 | |
884 | void | |
a092bcfd RGS |
885 | splitOnLastStarter(src) |
886 | SV * src | |
82e740b6 | 887 | PREINIT: |
a092bcfd | 888 | SV *svp; |
fe067ad9 | 889 | STRLEN srclen; |
82e740b6 NC |
890 | U8 *s, *e, *p; |
891 | PPCODE: | |
a257baa9 | 892 | s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); |
82e740b6 | 893 | e = s + srclen; |
fe067ad9 SP |
894 | p = e; |
895 | while (s < p) { | |
896 | UV uv; | |
82e740b6 NC |
897 | p = utf8_hop(p, -1); |
898 | if (p < s) | |
899 | croak(ErrHopBeforeStart); | |
fe067ad9 | 900 | uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF); |
82e740b6 NC |
901 | if (getCombinClass(uv) == 0) /* Last Starter found */ |
902 | break; | |
903 | } | |
904 | ||
905 | svp = sv_2mortal(newSVpvn((char*)s, p - s)); | |
906 | SvUTF8_on(svp); | |
907 | XPUSHs(svp); | |
908 | ||
909 | svp = sv_2mortal(newSVpvn((char*)p, e - p)); | |
910 | SvUTF8_on(svp); | |
911 | XPUSHs(svp); | |
912 |