This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Convert all Pod::Simple files to Unix line endings.
[perl5.git] / cpan / Unicode-Normalize / Normalize.xs
CommitLineData
ac5ea531
JH
1
2#include "EXTERN.h"
3#include "perl.h"
4#include "XSUB.h"
5
6/* These 5 files are prepared by mkheader */
7#include "unfcmb.h"
8#include "unfcan.h"
9#include "unfcpt.h"
10#include "unfcmp.h"
11#include "unfexc.h"
12
13/* Perl 5.6.1 ? */
14#ifndef uvuni_to_utf8
15#define uvuni_to_utf8 uv_to_utf8
6c941e0c 16#endif /* uvuni_to_utf8 */
ac5ea531
JH
17
18/* Perl 5.6.1 ? */
ab8fe378
JH
19#ifndef utf8n_to_uvuni
20#define utf8n_to_uvuni utf8_to_uv
6c941e0c 21#endif /* utf8n_to_uvuni */
ac5ea531 22
e524f5b2
NC
23/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
24#ifdef UTF8_ALLOW_BOM
25#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FFFF)
628bbff0 26#else
e524f5b2
NC
27#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
28#endif
29
30/* if utf8n_to_uvuni() sets retlen to 0 (?) */
fe067ad9 31#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
82e740b6
NC
32
33/* utf8_hop() hops back before start. Maybe broken UTF-8 */
34#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
35
51683ce6
TS
36/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC.
37 If Unicode would add a new composition of A + B to C
38 where bytes::length(A) + bytes::length(B) < bytes::length(C),
39 this code should be fixed.
40 In this case, mkheader will prevent Unicode::Normalize from building. */
fe067ad9
SP
41#define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"
42
43/* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
44#define ErrTargetNotEnough "panic (Unicode::Normalize %s): target not enough"
45
ac5ea531
JH
46/* At present, char > 0x10ffff are unaffected without complaint, right? */
47#define VALID_UTF_MAX (0x10ffff)
48#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
49
fe067ad9
SP
50/* size of array for combining characters */
51/* enough as an initial value? */
52#define CC_SEQ_SIZE (10)
53#define CC_SEQ_STEP (5)
54
55/* HANGUL begin */
ac5ea531
JH
56#define Hangul_SBase 0xAC00
57#define Hangul_SFinal 0xD7A3
58#define Hangul_SCount 11172
59
60#define Hangul_NCount 588
61
62#define Hangul_LBase 0x1100
63#define Hangul_LFinal 0x1112
64#define Hangul_LCount 19
65
66#define Hangul_VBase 0x1161
67#define Hangul_VFinal 0x1175
68#define Hangul_VCount 21
69
70#define Hangul_TBase 0x11A7
71#define Hangul_TFinal 0x11C2
72#define Hangul_TCount 28
73
74#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
2a204b45 75#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
ac5ea531
JH
76#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
77#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
78#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
79#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
fe067ad9 80/* HANGUL end */
ac5ea531
JH
81
82/* this is used for canonical ordering of combining characters (c.c.). */
83typedef struct {
84 U8 cc; /* combining class */
85 UV uv; /* codepoint */
86 STRLEN pos; /* position */
87} UNF_cc;
88
fe067ad9 89static int compare_cc(const void *a, const void *b)
ac5ea531
JH
90{
91 int ret_cc;
6c941e0c 92 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
8f118dcd
JH
93 if (ret_cc)
94 return ret_cc;
6c941e0c
JH
95
96 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
97 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
ac5ea531
JH
98}
99
fe067ad9 100static U8* dec_canonical(UV uv)
ac5ea531
JH
101{
102 U8 ***plane, **row;
8f118dcd
JH
103 if (OVER_UTF_MAX(uv))
104 return NULL;
ac5ea531 105 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd
JH
106 if (! plane)
107 return NULL;
ac5ea531
JH
108 row = plane[(uv >> 8) & 0xff];
109 return row ? row[uv & 0xff] : NULL;
110}
111
fe067ad9 112static U8* dec_compat(UV uv)
ac5ea531
JH
113{
114 U8 ***plane, **row;
8f118dcd
JH
115 if (OVER_UTF_MAX(uv))
116 return NULL;
ac5ea531 117 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd
JH
118 if (! plane)
119 return NULL;
ac5ea531
JH
120 row = plane[(uv >> 8) & 0xff];
121 return row ? row[uv & 0xff] : NULL;
122}
123
fe067ad9 124static UV composite_uv(UV uv, UV uv2)
ac5ea531
JH
125{
126 UNF_complist ***plane, **row, *cell, *i;
127
fe067ad9 128 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
8f118dcd 129 return 0;
ac5ea531 130
8f118dcd 131 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
fe067ad9
SP
132 UV lindex = uv - Hangul_LBase;
133 UV vindex = uv2 - Hangul_VBase;
134 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
135 Hangul_TCount);
ac5ea531 136 }
8f118dcd 137 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
fe067ad9
SP
138 UV tindex = uv2 - Hangul_TBase;
139 return(uv + tindex);
ac5ea531
JH
140 }
141 plane = UNF_compos[uv >> 16];
8f118dcd
JH
142 if (! plane)
143 return 0;
ac5ea531 144 row = plane[(uv >> 8) & 0xff];
8f118dcd
JH
145 if (! row)
146 return 0;
ac5ea531 147 cell = row[uv & 0xff];
8f118dcd
JH
148 if (! cell)
149 return 0;
150 for (i = cell; i->nextchar; i++) {
151 if (uv2 == i->nextchar)
152 return i->composite;
ac5ea531
JH
153 }
154 return 0;
155}
156
fe067ad9 157static U8 getCombinClass(UV uv)
ac5ea531
JH
158{
159 U8 **plane, *row;
8f118dcd
JH
160 if (OVER_UTF_MAX(uv))
161 return 0;
ac5ea531 162 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd
JH
163 if (! plane)
164 return 0;
ac5ea531
JH
165 row = plane[(uv >> 8) & 0xff];
166 return row ? row[uv & 0xff] : 0;
167}
168
fe067ad9 169static U8* pv_cat_decompHangul(U8* d, UV uv)
ac5ea531 170{
fe067ad9
SP
171 UV sindex = uv - Hangul_SBase;
172 UV lindex = sindex / Hangul_NCount;
173 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
174 UV tindex = sindex % Hangul_TCount;
ac5ea531 175
8f118dcd 176 if (! Hangul_IsS(uv))
fe067ad9 177 return d;
ac5ea531 178
fe067ad9
SP
179 d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
180 d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
8f118dcd 181 if (tindex)
fe067ad9
SP
182 d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
183 return d;
ac5ea531
JH
184}
185
39f4556f 186static char* sv_2pvunicode(SV *sv, STRLEN *lp)
a092bcfd
RGS
187{
188 char *s;
189 STRLEN len;
39f4556f 190 s = SvPV(sv,len);
a092bcfd 191 if (!SvUTF8(sv)) {
39f4556f 192 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
a092bcfd 193 if (!SvPOK(tmpsv))
39f4556f 194 s = SvPV_force(tmpsv,len);
a092bcfd 195 sv_utf8_upgrade(tmpsv);
39f4556f 196 s = SvPV(tmpsv,len);
a092bcfd 197 }
fe067ad9
SP
198 if (lp)
199 *lp = len;
a092bcfd
RGS
200 return s;
201}
202
fe067ad9
SP
203static
204U8* pv_utf8_decompose(U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
205{
206 U8* p = s;
207 U8* e = s + slen;
208 U8* dstart = *dp;
209 U8* d = dstart;
210
211 while (p < e) {
212 STRLEN retlen;
213 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 214 if (!retlen)
fe067ad9
SP
215 croak(ErrRetlenIsZero, "decompose");
216 p += retlen;
217
218 if (Hangul_IsS(uv)) {
219 STRLEN cur = d - dstart;
82e740b6 220
fe067ad9
SP
221 if (dlen < cur + UTF8_MAXLEN * 3) {
222 dlen += UTF8_MAXLEN * 3;
223 Renew(dstart, dlen+1, U8);
224 d = dstart + cur;
225 }
226 d = pv_cat_decompHangul(d, uv);
227 }
ac5ea531 228 else {
fe067ad9
SP
229 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
230
231 if (r) {
232 STRLEN len = (STRLEN)strlen((char *)r);
233 STRLEN cur = d - dstart;
234 if (dlen < cur + len) {
235 dlen += len;
236 Renew(dstart, dlen+1, U8);
237 d = dstart + cur;
238 }
239 while (len--)
240 *d++ = *r++;
241 }
242 else {
243 STRLEN cur = d - dstart;
244
245 if (dlen < cur + UTF8_MAXLEN) {
246 dlen += UTF8_MAXLEN;
247 Renew(dstart, dlen+1, U8);
248 d = dstart + cur;
249 }
250 d = uvuni_to_utf8(d, uv);
251 }
ac5ea531
JH
252 }
253 }
fe067ad9
SP
254 *dp = dstart;
255 return d;
256}
ac5ea531 257
fe067ad9
SP
258static
259U8* pv_utf8_reorder(U8* s, STRLEN slen, U8* d, STRLEN dlen)
260{
261 U8* p = s;
262 U8* e = s + slen;
263 U8* dend = d + dlen;
264
265 UNF_cc seq_ary[CC_SEQ_SIZE];
266 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
267 UNF_cc* seq_ext = NULL; /* extend if need */
268 STRLEN seq_max = CC_SEQ_SIZE;
269 STRLEN cc_pos = 0;
270
271 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
272 croak(ErrTargetNotEnough, "reorder");
273 dend -= UTF8_MAXLEN; /* safety */
274
275 while (p < e) {
276 U8 curCC;
277 STRLEN retlen;
278 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
279 if (!retlen)
280 croak(ErrRetlenIsZero, "reorder");
281 p += retlen;
ac5ea531 282
fe067ad9 283 curCC = getCombinClass(uv);
ac5ea531 284
fe067ad9
SP
285 if (curCC != 0) {
286 if (seq_max < cc_pos + 1) { /* extend if need */
287 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
288 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
289 STRLEN i;
290 New(0, seq_ext, seq_max, UNF_cc);
291 for (i = 0; i < cc_pos; i++)
292 seq_ext[i] = seq_ary[i];
293 }
294 else {
295 Renew(seq_ext, seq_max, UNF_cc);
296 }
39f4556f 297 seq_ptr = seq_ext; /* use seq_ext from now */
fe067ad9 298 }
a092bcfd 299
fe067ad9
SP
300 seq_ptr[cc_pos].cc = curCC;
301 seq_ptr[cc_pos].uv = uv;
302 seq_ptr[cc_pos].pos = cc_pos;
303 ++cc_pos;
ac5ea531 304
fe067ad9
SP
305 if (p < e)
306 continue;
307 }
ac5ea531 308
fe067ad9
SP
309 if (cc_pos) {
310 STRLEN i;
311
312 if (cc_pos > 1) /* reordered if there are two c.c.'s */
313 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
314
315 for (i = 0; i < cc_pos; i++) {
316 d = uvuni_to_utf8(d, seq_ptr[i].uv);
317 if (dend < d) /* real end is dend + UTF8_MAXLEN */
318 croak(ErrLongerThanSrc, "reorder");
319 }
320 cc_pos = 0;
321 }
2a204b45 322
e524f5b2
NC
323 if (curCC == 0) {
324 d = uvuni_to_utf8(d, uv);
fe067ad9
SP
325 if (dend < d) /* real end is dend + UTF8_MAXLEN */
326 croak(ErrLongerThanSrc, "reorder");
e524f5b2 327 }
fe067ad9
SP
328 }
329 if (seq_ext)
330 Safefree(seq_ext);
331 return d;
332}
ac5ea531 333
fe067ad9
SP
334static
335U8* pv_utf8_compose(U8* s, STRLEN slen, U8* d, STRLEN dlen, bool iscontig)
336{
337 U8* p = s;
338 U8* e = s + slen;
339 U8* dend = d + dlen;
340
39f4556f 341 UV uvS = 0; /* code point of the starter */
fe067ad9
SP
342 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
343 U8 preCC = 0;
344
345 UV seq_ary[CC_SEQ_SIZE];
346 UV* seq_ptr = seq_ary; /* use array at the beginning */
347 UV* seq_ext = NULL; /* extend if need */
348 STRLEN seq_max = CC_SEQ_SIZE;
349 STRLEN cc_pos = 0;
350
351 if (dlen < slen || dlen < slen + UTF8_MAXLEN)
352 croak(ErrTargetNotEnough, "compose");
353 dend -= UTF8_MAXLEN; /* safety */
354
355 while (p < e) {
356 U8 curCC;
357 STRLEN retlen;
358 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
359 if (!retlen)
360 croak(ErrRetlenIsZero, "compose");
361 p += retlen;
ac5ea531 362
fe067ad9 363 curCC = getCombinClass(uv);
82e740b6 364
fe067ad9 365 if (!valid_uvS) {
e524f5b2 366 if (curCC == 0) {
fe067ad9
SP
367 uvS = uv; /* the first Starter is found */
368 valid_uvS = TRUE;
369 if (p < e)
370 continue;
e524f5b2 371 }
fe067ad9
SP
372 else {
373 d = uvuni_to_utf8(d, uv);
374 if (dend < d) /* real end is dend + UTF8_MAXLEN */
375 croak(ErrLongerThanSrc, "compose");
376 continue;
377 }
378 }
379 else {
380 bool composed;
381
382 /* blocked */
383 if (iscontig && cc_pos || /* discontiguous combination */
384 curCC != 0 && preCC == curCC || /* blocked by same CC */
385 preCC > curCC) /* blocked by higher CC: revised D2 */
386 composed = FALSE;
387
388 /* not blocked:
389 iscontig && cc_pos == 0 -- contiguous combination
390 curCC == 0 && preCC == 0 -- starter + starter
391 curCC != 0 && preCC < curCC -- lower CC */
392 else {
393 /* try composition */
394 UV uvComp = composite_uv(uvS, uv);
395
396 if (uvComp && !isExclusion(uvComp)) {
397 uvS = uvComp;
398 composed = TRUE;
82e740b6 399
fe067ad9
SP
400 /* preCC should not be changed to curCC */
401 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
402 if (p < e)
403 continue;
404 }
405 else
406 composed = FALSE;
407 }
408
409 if (!composed) {
410 preCC = curCC;
411 if (curCC != 0 || !(p < e)) {
412 if (seq_max < cc_pos + 1) { /* extend if need */
413 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
414 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
415 New(0, seq_ext, seq_max, UV);
416 Copy(seq_ary, seq_ext, cc_pos, UV);
417 }
418 else {
419 Renew(seq_ext, seq_max, UV);
420 }
2b8d773d 421 seq_ptr = seq_ext; /* use seq_ext from now */
fe067ad9
SP
422 }
423 seq_ptr[cc_pos] = uv;
424 ++cc_pos;
425 }
426 if (curCC != 0 && p < e)
427 continue;
ac5ea531 428 }
ac5ea531
JH
429 }
430
fe067ad9
SP
431 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
432 if (dend < d) /* real end is dend + UTF8_MAXLEN */
433 croak(ErrLongerThanSrc, "compose");
434
e524f5b2 435 if (cc_pos) {
fe067ad9 436 STRLEN i;
ac5ea531 437
fe067ad9
SP
438 for (i = 0; i < cc_pos; i++) {
439 d = uvuni_to_utf8(d, seq_ptr[i]);
440 if (dend < d) /* real end is dend + UTF8_MAXLEN */
441 croak(ErrLongerThanSrc, "compose");
442 }
443 cc_pos = 0;
ac5ea531 444 }
fe067ad9
SP
445
446 uvS = uv;
ac5ea531 447 }
fe067ad9
SP
448 if (seq_ext)
449 Safefree(seq_ext);
450 return d;
451}
452
453MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
454
455SV*
456decompose(src, compat = &PL_sv_no)
457 SV * src
458 SV * compat
459 PROTOTYPE: $;$
460 PREINIT:
461 SV* dst;
462 U8 *s, *d, *dend;
463 STRLEN slen, dlen;
464 CODE:
465 s = (U8*)sv_2pvunicode(src,&slen);
466 dst = newSVpvn("", 0);
467 dlen = slen;
468 New(0, d, dlen+1, U8);
39f4556f 469 dend = pv_utf8_decompose(s, slen, &d, dlen, (bool)SvTRUE(compat));
9b374906 470 sv_setpvn(dst, (char *)d, dend - d);
fe067ad9
SP
471 SvUTF8_on(dst);
472 Safefree(d);
8f118dcd 473 RETVAL = dst;
ac5ea531
JH
474 OUTPUT:
475 RETVAL
476
fe067ad9
SP
477SV*
478reorder(src)
479 SV * src
480 PROTOTYPE: $
481 PREINIT:
482 SV* dst;
483 U8 *s, *d, *dend;
484 STRLEN slen, dlen;
485 CODE:
486 s = (U8*)sv_2pvunicode(src,&slen);
487 dst = newSVpvn("", 0);
488 dlen = slen + UTF8_MAXLEN;
489 d = (U8*)SvGROW(dst,dlen+1);
490 SvUTF8_on(dst);
491 dend = pv_utf8_reorder(s, slen, d, dlen);
492 *dend = '\0';
493 SvCUR_set(dst, dend - d);
494 RETVAL = dst;
495 OUTPUT:
496 RETVAL
ac5ea531 497
2a204b45 498SV*
a092bcfd
RGS
499compose(src)
500 SV * src
ac5ea531 501 PROTOTYPE: $
82e740b6
NC
502 ALIAS:
503 composeContiguous = 1
ac5ea531 504 PREINIT:
fe067ad9
SP
505 SV* dst;
506 U8 *s, *d, *dend;
507 STRLEN slen, dlen;
2a204b45 508 CODE:
fe067ad9
SP
509 s = (U8*)sv_2pvunicode(src,&slen);
510 dst = newSVpvn("", 0);
511 dlen = slen + UTF8_MAXLEN;
512 d = (U8*)SvGROW(dst,dlen+1);
ac5ea531 513 SvUTF8_on(dst);
fe067ad9
SP
514 dend = pv_utf8_compose(s, slen, d, dlen, (bool)ix);
515 *dend = '\0';
516 SvCUR_set(dst, dend - d);
517 RETVAL = dst;
518 OUTPUT:
519 RETVAL
ac5ea531 520
fe067ad9
SP
521SV*
522NFD(src)
523 SV * src
524 PROTOTYPE: $
525 ALIAS:
526 NFKD = 1
527 PREINIT:
528 SV *dst;
529 U8 *s, *t, *tend, *d, *dend;
530 STRLEN slen, tlen, dlen;
531 CODE:
532 /* decompose */
533 s = (U8*)sv_2pvunicode(src,&slen);
534 tlen = slen;
535 New(0, t, tlen+1, U8);
536 tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)ix);
537 *tend = '\0';
538 tlen = tend - t; /* no longer know real tlen */
539
540 /* reorder */
541 dst = newSVpvn("", 0);
542 dlen = tlen + UTF8_MAXLEN;
543 d = (U8*)SvGROW(dst,dlen+1);
544 SvUTF8_on(dst);
545 dend = pv_utf8_reorder(t, tlen, d, dlen);
546 *dend = '\0';
547 SvCUR_set(dst, dend - d);
ac5ea531 548
fe067ad9
SP
549 /* return */
550 Safefree(t);
551 RETVAL = dst;
552 OUTPUT:
553 RETVAL
82e740b6 554
fe067ad9
SP
555SV*
556NFC(src)
557 SV * src
558 PROTOTYPE: $
559 ALIAS:
560 NFKC = 1
561 FCC = 2
562 PREINIT:
563 SV *dst;
564 U8 *s, *t, *tend, *u, *uend, *d, *dend;
565 STRLEN slen, tlen, ulen, dlen;
566 CODE:
567 /* decompose */
568 s = (U8*)sv_2pvunicode(src,&slen);
569 tlen = slen;
570 New(0, t, tlen+1, U8);
571 tend = pv_utf8_decompose(s, slen, &t, tlen, (bool)(ix==1));
572 *tend = '\0';
573 tlen = tend - t; /* no longer know real tlen */
574
575 /* reorder */
576 ulen = tlen + UTF8_MAXLEN;
577 New(0, u, ulen+1, U8);
578 uend = pv_utf8_reorder(t, tlen, u, ulen);
579 *uend = '\0';
580 ulen = uend - u;
581
582 /* compose */
583 dst = newSVpvn("", 0);
584 dlen = ulen + UTF8_MAXLEN;
585 d = (U8*)SvGROW(dst,dlen+1);
586 SvUTF8_on(dst);
587 dend = pv_utf8_compose(u, ulen, d, dlen, (bool)(ix==2));
588 *dend = '\0';
589 SvCUR_set(dst, dend - d);
ac5ea531 590
fe067ad9
SP
591 /* return */
592 Safefree(t);
593 Safefree(u);
2a204b45
JH
594 RETVAL = dst;
595 OUTPUT:
596 RETVAL
ac5ea531 597
2b8d773d 598SV*
a092bcfd
RGS
599checkNFD(src)
600 SV * src
8f118dcd
JH
601 PROTOTYPE: $
602 ALIAS:
603 checkNFKD = 1
604 PREINIT:
8f118dcd
JH
605 STRLEN srclen, retlen;
606 U8 *s, *e, *p, curCC, preCC;
2b8d773d 607 bool result = TRUE;
82e740b6 608 CODE:
a092bcfd 609 s = (U8*)sv_2pvunicode(src,&srclen);
8f118dcd
JH
610 e = s + srclen;
611
612 preCC = 0;
613 for (p = s; p < e; p += retlen) {
fe067ad9 614 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 615 if (!retlen)
fe067ad9 616 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
82e740b6 617
8f118dcd 618 curCC = getCombinClass(uv);
2b8d773d
RGS
619 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
620 result = FALSE;
621 break;
622 }
623 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
624 result = FALSE;
625 break;
626 }
8f118dcd
JH
627 preCC = curCC;
628 }
2b8d773d
RGS
629 RETVAL = boolSV(result);
630 OUTPUT:
631 RETVAL
8f118dcd
JH
632
633
2b8d773d 634SV*
a092bcfd
RGS
635checkNFC(src)
636 SV * src
8f118dcd
JH
637 PROTOTYPE: $
638 ALIAS:
639 checkNFKC = 1
640 PREINIT:
8f118dcd
JH
641 STRLEN srclen, retlen;
642 U8 *s, *e, *p, curCC, preCC;
2b8d773d
RGS
643 bool result = TRUE;
644 bool isMAYBE = FALSE;
82e740b6 645 CODE:
a092bcfd 646 s = (U8*)sv_2pvunicode(src,&srclen);
8f118dcd
JH
647 e = s + srclen;
648
649 preCC = 0;
8f118dcd 650 for (p = s; p < e; p += retlen) {
fe067ad9 651 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 652 if (!retlen)
fe067ad9 653 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
82e740b6 654
8f118dcd 655 curCC = getCombinClass(uv);
2b8d773d
RGS
656 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
657 result = FALSE;
658 break;
659 }
8f118dcd
JH
660
661 /* get NFC/NFKC property */
662 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
663 ; /* YES */
2b8d773d
RGS
664 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
665 result = FALSE;
666 break;
667 }
8f118dcd
JH
668 else if (isComp2nd(uv))
669 isMAYBE = TRUE;
670 else if (ix) {
671 char *canon, *compat;
6c941e0c 672 /* NFKC_NO when having compatibility mapping. */
8f118dcd
JH
673 canon = (char *) dec_canonical(uv);
674 compat = (char *) dec_compat(uv);
2b8d773d
RGS
675 if (compat && !(canon && strEQ(canon, compat))) {
676 result = FALSE;
677 break;
678 }
8f118dcd
JH
679 } /* end of get NFC/NFKC property */
680
681 preCC = curCC;
682 }
2b8d773d 683 if (isMAYBE && result) /* NO precedes MAYBE */
8f118dcd 684 XSRETURN_UNDEF;
2b8d773d
RGS
685 RETVAL = boolSV(result);
686 OUTPUT:
687 RETVAL
8f118dcd
JH
688
689
2b8d773d 690SV*
a092bcfd
RGS
691checkFCD(src)
692 SV * src
82e740b6
NC
693 PROTOTYPE: $
694 ALIAS:
695 checkFCC = 1
696 PREINIT:
fe067ad9 697 STRLEN srclen, retlen;
82e740b6 698 U8 *s, *e, *p, curCC, preCC;
2b8d773d
RGS
699 bool result = TRUE;
700 bool isMAYBE = FALSE;
82e740b6 701 CODE:
a092bcfd 702 s = (U8*)sv_2pvunicode(src,&srclen);
82e740b6 703 e = s + srclen;
82e740b6 704 preCC = 0;
82e740b6 705 for (p = s; p < e; p += retlen) {
fe067ad9
SP
706 U8 *sCan;
707 UV uvLead;
39f4556f 708 STRLEN canlen = 0;
fe067ad9 709 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 710 if (!retlen)
fe067ad9 711 croak(ErrRetlenIsZero, "checkFCD or -FCC");
82e740b6
NC
712
713 sCan = (U8*) dec_canonical(uv);
714
715 if (sCan) {
39f4556f 716 STRLEN canret;
82e740b6 717 canlen = (STRLEN)strlen((char *) sCan);
e524f5b2 718 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
fe067ad9
SP
719 if (!canret)
720 croak(ErrRetlenIsZero, "checkFCD or -FCC");
82e740b6
NC
721 }
722 else {
723 uvLead = uv;
724 }
725
726 curCC = getCombinClass(uvLead);
727
2b8d773d
RGS
728 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
729 result = FALSE;
730 break;
731 }
82e740b6
NC
732
733 if (ix) {
2b8d773d
RGS
734 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
735 result = FALSE;
736 break;
737 }
82e740b6
NC
738 else if (isComp2nd(uv))
739 isMAYBE = TRUE;
740 }
741
742 if (sCan) {
39f4556f 743 STRLEN canret;
fe067ad9
SP
744 UV uvTrail;
745 U8* eCan = sCan + canlen;
746 U8* pCan = utf8_hop(eCan, -1);
82e740b6
NC
747 if (pCan < sCan)
748 croak(ErrHopBeforeStart);
e524f5b2 749 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
fe067ad9
SP
750 if (!canret)
751 croak(ErrRetlenIsZero, "checkFCD or -FCC");
82e740b6
NC
752 preCC = getCombinClass(uvTrail);
753 }
754 else {
755 preCC = curCC;
756 }
757 }
2b8d773d 758 if (isMAYBE && result) /* NO precedes MAYBE */
82e740b6 759 XSRETURN_UNDEF;
2b8d773d
RGS
760 RETVAL = boolSV(result);
761 OUTPUT:
762 RETVAL
82e740b6
NC
763
764
ac5ea531
JH
765U8
766getCombinClass(uv)
767 UV uv
8f118dcd 768 PROTOTYPE: $
ac5ea531
JH
769
770bool
2a204b45 771isExclusion(uv)
ac5ea531 772 UV uv
8f118dcd
JH
773 PROTOTYPE: $
774
775bool
776isSingleton(uv)
777 UV uv
778 PROTOTYPE: $
779
780bool
781isNonStDecomp(uv)
782 UV uv
783 PROTOTYPE: $
784
785bool
786isComp2nd(uv)
787 UV uv
788 PROTOTYPE: $
789 ALIAS:
790 isNFC_MAYBE = 1
791 isNFKC_MAYBE = 2
792
793
794
2b8d773d 795SV*
8f118dcd
JH
796isNFD_NO(uv)
797 UV uv
798 PROTOTYPE: $
799 ALIAS:
800 isNFKD_NO = 1
2b8d773d
RGS
801 PREINIT:
802 bool result = FALSE;
82e740b6 803 CODE:
8f118dcd 804 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
2b8d773d
RGS
805 result = TRUE; /* NFD_NO or NFKD_NO */
806 RETVAL = boolSV(result);
807 OUTPUT:
808 RETVAL
8f118dcd
JH
809
810
2b8d773d 811SV*
8f118dcd
JH
812isComp_Ex(uv)
813 UV uv
814 PROTOTYPE: $
815 ALIAS:
816 isNFC_NO = 0
817 isNFKC_NO = 1
2b8d773d
RGS
818 PREINIT:
819 bool result = FALSE;
82e740b6 820 CODE:
8f118dcd 821 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
2b8d773d 822 result = TRUE; /* NFC_NO or NFKC_NO */
8f118dcd
JH
823 else if (ix) {
824 char *canon, *compat;
825 canon = (char *) dec_canonical(uv);
826 compat = (char *) dec_compat(uv);
827 if (compat && (!canon || strNE(canon, compat)))
2b8d773d 828 result = TRUE; /* NFC_NO or NFKC_NO */
8f118dcd 829 }
2b8d773d
RGS
830 RETVAL = boolSV(result);
831 OUTPUT:
832 RETVAL
ac5ea531 833
2a204b45 834SV*
ac5ea531
JH
835getComposite(uv, uv2)
836 UV uv
837 UV uv2
2a204b45
JH
838 PROTOTYPE: $$
839 PREINIT:
bcdb689b 840 UV composite;
2a204b45 841 CODE:
bcdb689b
JH
842 composite = composite_uv(uv, uv2);
843 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
2a204b45
JH
844 OUTPUT:
845 RETVAL
ac5ea531 846
8f118dcd
JH
847
848
ac5ea531
JH
849SV*
850getCanon(uv)
851 UV uv
852 PROTOTYPE: $
853 ALIAS:
854 getCompat = 1
ac5ea531 855 CODE:
8f118dcd 856 if (Hangul_IsS(uv)) {
fe067ad9
SP
857 U8 tmp[3 * UTF8_MAXLEN + 1];
858 U8 *t = tmp;
859 U8 *e = pv_cat_decompHangul(t, uv);
860 RETVAL = newSVpvn((char *)t, e - t);
ac5ea531 861 } else {
fe067ad9 862 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
8f118dcd
JH
863 if (!rstr)
864 XSRETURN_UNDEF;
ac5ea531
JH
865 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
866 }
867 SvUTF8_on(RETVAL);
868 OUTPUT:
869 RETVAL
870
82e740b6
NC
871
872void
a092bcfd
RGS
873splitOnLastStarter(src)
874 SV * src
82e740b6 875 PREINIT:
a092bcfd 876 SV *svp;
fe067ad9 877 STRLEN srclen;
82e740b6
NC
878 U8 *s, *e, *p;
879 PPCODE:
a092bcfd 880 s = (U8*)sv_2pvunicode(src,&srclen);
82e740b6 881 e = s + srclen;
fe067ad9
SP
882 p = e;
883 while (s < p) {
884 UV uv;
82e740b6
NC
885 p = utf8_hop(p, -1);
886 if (p < s)
887 croak(ErrHopBeforeStart);
fe067ad9 888 uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
82e740b6
NC
889 if (getCombinClass(uv) == 0) /* Last Starter found */
890 break;
891 }
892
893 svp = sv_2mortal(newSVpvn((char*)s, p - s));
894 SvUTF8_on(svp);
895 XPUSHs(svp);
896
897 svp = sv_2mortal(newSVpvn((char*)p, e - p));
898 SvUTF8_on(svp);
899 XPUSHs(svp);
900