Commit | Line | Data |
---|---|---|
ac5ea531 JH |
1 | |
2 | #include "EXTERN.h" | |
3 | #include "perl.h" | |
4 | #include "XSUB.h" | |
5 | ||
6 | /* These 5 files are prepared by mkheader */ | |
7 | #include "unfcmb.h" | |
8 | #include "unfcan.h" | |
9 | #include "unfcpt.h" | |
10 | #include "unfcmp.h" | |
11 | #include "unfexc.h" | |
12 | ||
/*
 * Perl 5.6.1 ? -- when the newer converter names are not available as
 * macros, map them onto the older uv_to_utf8 / utf8_to_uv names.
 */
#if !defined(uvuni_to_utf8)
#  define uvuni_to_utf8 uv_to_utf8
#endif

#if !defined(utf8n_to_uvchr)
#  define utf8n_to_uvchr utf8_to_uv
#endif
22 | ||
/*
 * Largest code point the lookup tables are consulted for.  Anything above
 * it is rejected by the helpers in this file (they return NULL / 0), so
 * such characters are left unaffected, without complaint.
 */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) ((uv) > VALID_UTF_MAX)
26 | ||
/* HANGUL_H */

/* Base code point and size of each Hangul range. */
#define Hangul_SBase  0xAC00
#define Hangul_SCount 11172

#define Hangul_LBase  0x1100
#define Hangul_LCount 19

#define Hangul_VBase  0x1161
#define Hangul_VCount 21

/* TBase is one below the first trailing consonant (tindex 0 means "no T"). */
#define Hangul_TBase  0x11A7
#define Hangul_TCount 28

/* Number of syllables sharing one leading consonant: 21 * 28 = 588. */
#define Hangul_NCount (Hangul_VCount * Hangul_TCount)

/* Last code point of each range (inclusive), derived from base + count. */
#define Hangul_SFinal (Hangul_SBase + Hangul_SCount - 1)  /* 0xD7A3 */
#define Hangul_LFinal (Hangul_LBase + Hangul_LCount - 1)  /* 0x1112 */
#define Hangul_VFinal (Hangul_VBase + Hangul_VCount - 1)  /* 0x1175 */
#define Hangul_TFinal (Hangul_TBase + Hangul_TCount - 1)  /* 0x11C2 */

/* Range predicates.  Note that each one evaluates its argument twice. */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
/* strictly greater than TBase: TBase itself is not a trailing consonant */
#define Hangul_IsT(u)  (((u) >  Hangul_TBase) && ((u) <= Hangul_TFinal))

/* Offset from SBase is a multiple of TCount, i.e. the tindex part is 0. */
#define Hangul_IsN(u)  ((((u) - Hangul_SBase) % Hangul_TCount) == 0)
/* A syllable without a trailing consonant. */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))

/* HANGUL_H */
53 | ||
54 | /* this is used for canonical ordering of combining characters (c.c.). */ | |
55 | typedef struct { | |
56 | U8 cc; /* combining class */ | |
57 | UV uv; /* codepoint */ | |
58 | STRLEN pos; /* position */ | |
59 | } UNF_cc; | |
60 | ||
61 | int compare_cc(const void *a, const void *b) | |
62 | { | |
63 | int ret_cc; | |
64 | ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc; | |
65 | if(ret_cc) return ret_cc; | |
66 | return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos; | |
67 | } | |
68 | ||
69 | U8* dec_canonical (UV uv) | |
70 | { | |
71 | U8 ***plane, **row; | |
72 | if(OVER_UTF_MAX(uv)) return NULL; | |
73 | plane = (U8***)UNF_canon[uv >> 16]; | |
74 | if(! plane) return NULL; | |
75 | row = plane[(uv >> 8) & 0xff]; | |
76 | return row ? row[uv & 0xff] : NULL; | |
77 | } | |
78 | ||
79 | U8* dec_compat (UV uv) | |
80 | { | |
81 | U8 ***plane, **row; | |
82 | if(OVER_UTF_MAX(uv)) return NULL; | |
83 | plane = (U8***)UNF_compat[uv >> 16]; | |
84 | if(! plane) return NULL; | |
85 | row = plane[(uv >> 8) & 0xff]; | |
86 | return row ? row[uv & 0xff] : NULL; | |
87 | } | |
88 | ||
2a204b45 | 89 | UV composite_uv (UV uv, UV uv2) |
ac5ea531 JH |
90 | { |
91 | UNF_complist ***plane, **row, *cell, *i; | |
92 | ||
93 | if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0; | |
94 | ||
95 | if(Hangul_IsL(uv) && Hangul_IsV(uv2)) { | |
96 | uv -= Hangul_LBase; /* lindex */ | |
97 | uv2 -= Hangul_VBase; /* vindex */ | |
98 | return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount); | |
99 | } | |
100 | if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) { | |
101 | uv2 -= Hangul_TBase; /* tindex */ | |
2a204b45 | 102 | return(uv + uv2); |
ac5ea531 JH |
103 | } |
104 | plane = UNF_compos[uv >> 16]; | |
105 | if(! plane) return 0; | |
106 | row = plane[(uv >> 8) & 0xff]; | |
2a204b45 | 107 | if(! row) return 0; |
ac5ea531 | 108 | cell = row[uv & 0xff]; |
2a204b45 | 109 | if(! cell) return 0; |
ac5ea531 JH |
110 | for(i = cell; i->nextchar; i++) { |
111 | if(uv2 == i->nextchar) return i->composite; | |
112 | } | |
113 | return 0; | |
114 | } | |
115 | ||
116 | U8 getCombinClass (UV uv) | |
117 | { | |
118 | U8 **plane, *row; | |
119 | if(OVER_UTF_MAX(uv)) return 0; | |
120 | plane = (U8**)UNF_combin[uv >> 16]; | |
121 | if(! plane) return 0; | |
122 | row = plane[(uv >> 8) & 0xff]; | |
123 | return row ? row[uv & 0xff] : 0; | |
124 | } | |
125 | ||
126 | void sv_cat_decompHangul (SV* sv, UV uv) | |
127 | { | |
128 | UV sindex, lindex, vindex, tindex; | |
2a204b45 | 129 | U8 *t, tmp[3 * UTF8_MAXLEN + 1]; |
ac5ea531 JH |
130 | |
131 | if(! Hangul_IsS(uv)) return; | |
132 | ||
133 | sindex = uv - Hangul_SBase; | |
134 | lindex = sindex / Hangul_NCount; | |
135 | vindex = (sindex % Hangul_NCount) / Hangul_TCount; | |
136 | tindex = sindex % Hangul_TCount; | |
137 | ||
2a204b45 | 138 | t = tmp; |
ac5ea531 JH |
139 | t = uvuni_to_utf8(t, (lindex + Hangul_LBase)); |
140 | t = uvuni_to_utf8(t, (vindex + Hangul_VBase)); | |
141 | if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase)); | |
142 | *t = '\0'; | |
2a204b45 | 143 | sv_catpvn(sv, (char *)tmp, strlen((char *)tmp)); |
ac5ea531 JH |
144 | } |
145 | ||
146 | MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize | |
147 | ||
ac5ea531 JH |
148 | SV* |
149 | decompose(arg, compat) | |
150 | SV * arg | |
151 | SV * compat | |
152 | PROTOTYPE: $ | |
153 | PREINIT: | |
2a204b45 | 154 | UV uv; |
ac5ea531 | 155 | SV *src, *dst; |
73263a9c JH |
156 | STRLEN srclen, retlen; |
157 | U8 *s, *e, *p, *r; | |
ac5ea531 JH |
158 | bool iscompat; |
159 | CODE: | |
160 | if(SvUTF8(arg)) { | |
161 | src = arg; | |
162 | } else { | |
163 | src = sv_mortalcopy(arg); | |
164 | sv_utf8_upgrade(src); | |
165 | } | |
ac5ea531 JH |
166 | iscompat = SvTRUE(compat); |
167 | ||
168 | dst = newSV(1); | |
169 | (void)SvPOK_only(dst); | |
170 | SvUTF8_on(dst); | |
171 | ||
172 | s = (U8*)SvPV(src,srclen); | |
173 | e = s + srclen; | |
174 | for(p = s; p < e;){ | |
175 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); | |
176 | p += retlen; | |
177 | if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv); | |
178 | else { | |
179 | r = iscompat ? dec_compat(uv) : dec_canonical(uv); | |
180 | if(r) sv_catpv(dst, (char *)r); | |
181 | else sv_catpvn(dst, (char *)p - retlen, retlen); | |
182 | } | |
183 | } | |
184 | RETVAL = dst; | |
185 | OUTPUT: | |
186 | RETVAL | |
187 | ||
188 | ||
189 | ||
190 | SV* | |
191 | reorder(arg) | |
192 | SV * arg | |
193 | PROTOTYPE: $ | |
194 | PREINIT: | |
195 | SV *src; | |
196 | STRLEN srclen, retlen, stk_cc_max; | |
197 | U8 *s, *e, *p, curCC; | |
198 | UV uv; | |
199 | UNF_cc * stk_cc; | |
200 | CODE: | |
201 | src = newSVsv(arg); | |
202 | if(! SvUTF8(arg)) sv_utf8_upgrade(src); | |
203 | ||
204 | stk_cc_max = 10; /* enough as an initial value? */ | |
205 | New(0, stk_cc, stk_cc_max, UNF_cc); | |
206 | ||
207 | s = (U8*)SvPV(src,srclen); | |
208 | e = s + srclen; | |
2a204b45 | 209 | |
ac5ea531 JH |
210 | for(p = s; p < e;){ |
211 | U8 *cc_in; | |
212 | STRLEN cc_len, cc_iter, cc_pos; | |
213 | ||
214 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); | |
ac5ea531 | 215 | curCC = getCombinClass(uv); |
2a204b45 JH |
216 | p += retlen; |
217 | ||
ac5ea531 JH |
218 | if(! (curCC && p < e)) continue; else cc_in = p - retlen; |
219 | ||
2a204b45 | 220 | cc_pos = 0; |
ac5ea531 JH |
221 | stk_cc[cc_pos].cc = curCC; |
222 | stk_cc[cc_pos].uv = uv; | |
223 | stk_cc[cc_pos].pos = cc_pos; | |
224 | ||
225 | while(p < e) { | |
226 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); | |
227 | curCC = getCombinClass(uv); | |
228 | if(!curCC) break; | |
229 | p += retlen; | |
230 | cc_pos++; | |
231 | if(stk_cc_max <= cc_pos) { /* extend if need */ | |
232 | stk_cc_max = cc_pos + 1; | |
233 | Renew(stk_cc, stk_cc_max, UNF_cc); | |
234 | } | |
235 | stk_cc[cc_pos].cc = curCC; | |
236 | stk_cc[cc_pos].uv = uv; | |
237 | stk_cc[cc_pos].pos = cc_pos; | |
238 | } | |
239 | ||
240 | /* only one c.c. in cc_len from cc_in, no need of reordering */ | |
241 | if(!cc_pos) continue; | |
242 | ||
243 | qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc); | |
244 | ||
245 | cc_len = p - cc_in; | |
246 | p = cc_in; | |
247 | for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) { | |
248 | p = uvuni_to_utf8(p, stk_cc[cc_iter].uv); | |
249 | } | |
250 | } | |
251 | Safefree(stk_cc); | |
252 | RETVAL = src; | |
253 | OUTPUT: | |
254 | RETVAL | |
255 | ||
256 | ||
257 | ||
2a204b45 | 258 | SV* |
ac5ea531 JH |
259 | compose(arg) |
260 | SV * arg | |
261 | PROTOTYPE: $ | |
262 | PREINIT: | |
263 | SV *src, *dst, *tmp; | |
264 | U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC; | |
265 | UV uv, uvS, uvComp; | |
2a204b45 | 266 | STRLEN srclen, dstlen, tmplen, retlen; |
ac5ea531 | 267 | bool beginning = TRUE; |
2a204b45 | 268 | CODE: |
ac5ea531 JH |
269 | if(SvUTF8(arg)) { |
270 | src = arg; | |
271 | } else { | |
272 | src = sv_mortalcopy(arg); | |
273 | sv_utf8_upgrade(src); | |
274 | } | |
2a204b45 | 275 | |
ac5ea531 JH |
276 | s = (U8*)SvPV(src, srclen); |
277 | e = s + srclen; | |
278 | dstlen = srclen + 1; /* equal or shorter, XXX */ | |
2a204b45 | 279 | dst = newSV(dstlen); |
ac5ea531 JH |
280 | (void)SvPOK_only(dst); |
281 | SvUTF8_on(dst); | |
282 | d = (U8*)SvPVX(dst); | |
283 | ||
284 | /* for uncomposed combining char */ | |
285 | tmp = sv_2mortal(newSV(dstlen)); | |
286 | (void)SvPOK_only(tmp); | |
287 | SvUTF8_on(tmp); | |
288 | ||
289 | for(p = s; p < e;){ | |
290 | if(beginning) { | |
291 | uvS = utf8n_to_uvchr(p, e - p, &retlen, 0); | |
292 | p += retlen; | |
293 | ||
294 | if (getCombinClass(uvS)){ /* no Starter found yet */ | |
295 | d = uvuni_to_utf8(d, uvS); | |
296 | continue; | |
297 | } | |
298 | beginning = FALSE; | |
299 | } | |
300 | ||
301 | /* Starter */ | |
302 | t = tmp_start = (U8*)SvPVX(tmp); | |
303 | preCC = 0; | |
304 | ||
305 | /* to the next Starter */ | |
306 | while(p < e) { | |
307 | uv = utf8n_to_uvchr(p, e - p, &retlen, 0); | |
308 | p += retlen; | |
309 | curCC = getCombinClass(uv); | |
310 | ||
311 | if(preCC && preCC == curCC) { | |
312 | preCC = curCC; | |
313 | t = uvuni_to_utf8(t, uv); | |
314 | } else { | |
2a204b45 | 315 | uvComp = composite_uv(uvS, uv); |
ac5ea531 JH |
316 | |
317 | /* S + C + S => S-S + C would be also blocked. */ | |
2a204b45 | 318 | if( uvComp && ! isExclusion(uvComp) && preCC <= curCC) |
ac5ea531 JH |
319 | { |
320 | /* preCC not changed to curCC */ | |
321 | uvS = uvComp; | |
322 | } else if (! curCC && p < e) { /* blocked */ | |
323 | break; | |
324 | } else { | |
325 | preCC = curCC; | |
326 | t = uvuni_to_utf8(t, uv); | |
327 | } | |
328 | } | |
329 | } | |
2a204b45 | 330 | d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */ |
20d72259 | 331 | if((tmplen = t - tmp_start)) { /* uncomposed combining char */ |
ac5ea531 JH |
332 | t = (U8*)SvPVX(tmp); |
333 | while(tmplen--) *d++ = *t++; | |
334 | } | |
335 | uvS = uv; | |
336 | } /* for */ | |
2a204b45 JH |
337 | e = d; /* end of dst */ |
338 | d = (U8*)SvPVX(dst); | |
339 | SvCUR_set(dst, e - d); | |
340 | RETVAL = dst; | |
341 | OUTPUT: | |
342 | RETVAL | |
ac5ea531 JH |
343 | |
344 | ||
345 | ||
346 | U8 | |
347 | getCombinClass(uv) | |
348 | UV uv | |
349 | ||
350 | bool | |
2a204b45 | 351 | isExclusion(uv) |
ac5ea531 JH |
352 | UV uv |
353 | ||
2a204b45 | 354 | SV* |
ac5ea531 JH |
355 | getComposite(uv, uv2) |
356 | UV uv | |
357 | UV uv2 | |
2a204b45 JH |
358 | PROTOTYPE: $$ |
359 | PREINIT: | |
360 | UV comp; | |
361 | CODE: | |
362 | comp = composite_uv(uv, uv2); | |
363 | RETVAL = comp ? newSVuv(comp) : &PL_sv_undef; | |
364 | OUTPUT: | |
365 | RETVAL | |
ac5ea531 JH |
366 | |
367 | SV* | |
368 | getCanon(uv) | |
369 | UV uv | |
370 | PROTOTYPE: $ | |
371 | ALIAS: | |
372 | getCompat = 1 | |
373 | PREINIT: | |
374 | U8 * rstr; | |
375 | CODE: | |
376 | if(Hangul_IsS(uv)) { | |
377 | SV * dst; | |
378 | dst = newSV(1); | |
379 | (void)SvPOK_only(dst); | |
380 | sv_cat_decompHangul(dst, uv); | |
381 | RETVAL = dst; | |
382 | } else { | |
383 | rstr = ix ? dec_compat(uv) : dec_canonical(uv); | |
384 | if(!rstr) XSRETURN_UNDEF; | |
385 | RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); | |
386 | } | |
387 | SvUTF8_on(RETVAL); | |
388 | OUTPUT: | |
389 | RETVAL | |
390 |