This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
amigaos4: no sigaction, no si fields
[perl5.git] / cpan / Unicode-Normalize / Normalize.xs
CommitLineData
c6b7cc21
SH
1
2#define PERL_NO_GET_CONTEXT /* we want efficiency */
3
4/* private functions which need pTHX_ and aTHX_
5 pv_cat_decompHangul
6 sv_2pvunicode
7 pv_utf8_decompose
8 pv_utf8_reorder
9 pv_utf8_compose
10*/
11
12#include "EXTERN.h"
13#include "perl.h"
14#include "XSUB.h"
15
16/* These 5 files are prepared by mkheader */
17#include "unfcmb.h"
18#include "unfcan.h"
19#include "unfcpt.h"
20#include "unfcmp.h"
21#include "unfexc.h"
22
23/* The generated normalization tables since v5.20 are in native character set
24 * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for
25 * later perls, and redefine that to be 'uvuni' for earlier ones */
26#if PERL_VERSION < 20
27# undef uvchr_to_utf8
28# ifdef uvuni_to_utf8
29# define uvchr_to_utf8 uvuni_to_utf8
30# else /* Perl 5.6.1 */
31# define uvchr_to_utf8 uv_to_utf8
32# endif
33
34# undef utf8n_to_uvchr
35# ifdef utf8n_to_uvuni
36# define utf8n_to_uvchr utf8n_to_uvuni
37# else /* Perl 5.6.1 */
38# define utf8n_to_uvchr utf8_to_uv
39# endif
40#endif
41
42/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
43#ifndef UTF8_ALLOW_BOM
44#define UTF8_ALLOW_BOM (0)
45#endif /* UTF8_ALLOW_BOM */
46
47#ifndef UTF8_ALLOW_SURROGATE
48#define UTF8_ALLOW_SURROGATE (0)
49#endif /* UTF8_ALLOW_SURROGATE */
50
51#ifndef UTF8_ALLOW_FE_FF
52#define UTF8_ALLOW_FE_FF (0)
53#endif /* UTF8_ALLOW_FE_FF */
54
55#ifndef UTF8_ALLOW_FFFF
56#define UTF8_ALLOW_FFFF (0)
57#endif /* UTF8_ALLOW_FFFF */
58
1ef95abd
SH
59#ifndef PERL_UNUSED_VAR
60# define PERL_UNUSED_VAR(x) ((void)sizeof(x))
61#endif
62
c6b7cc21
SH
63#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
64
/*
 * Renew_d_if_not_enough_to(need)
 *
 * Make sure at least `need` more bytes can be written at `d`, e.g. before
 * a call to uvchr_to_utf8().  The caller must have these lvalues in scope:
 *   dstart - start of the output buffer
 *   d      - current write position inside that buffer
 *   dlen   - current capacity (the buffer is Renew()ed to dlen+1 bytes)
 * When the room left is too small, dlen grows by `need`, the buffer is
 * Renew()ed (so dstart may change) and d is re-pointed at the same offset.
 *
 * The expansion is a self-contained brace block, so its helper variable
 * does not leak into the caller's scope: the macro may follow ordinary
 * statements and may be used more than once in the same block.  Call
 * sites write it WITHOUT a trailing semicolon, which is why this is a
 * plain block rather than do { } while (0).
 */
#define Renew_d_if_not_enough_to(need)	{				\
	STRLEN unf_curlen_ = (STRLEN)(d - dstart);			\
	if (dlen < unf_curlen_ + (need)) {				\
	    dlen += (need);						\
	    Renew(dstart, dlen+1, U8);					\
	    d = dstart + unf_curlen_;					\
	}								\
    }
73
74/* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */
75#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
76
77/* utf8_hop() hops back before start. Maybe broken UTF-8 */
78#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
79
80/* At present, char > 0x10ffff are unaffected without complaint, right? */
81#define VALID_UTF_MAX (0x10ffff)
82#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
83
84/* size of array for combining characters */
85/* enough as an initial value? */
86#define CC_SEQ_SIZE (10)
87#define CC_SEQ_STEP (5)
88
89/* HANGUL begin */
90#define Hangul_SBase 0xAC00
91#define Hangul_SFinal 0xD7A3
92#define Hangul_SCount 11172
93
94#define Hangul_NCount 588
95
96#define Hangul_LBase 0x1100
97#define Hangul_LFinal 0x1112
98#define Hangul_LCount 19
99
100#define Hangul_VBase 0x1161
101#define Hangul_VFinal 0x1175
102#define Hangul_VCount 21
103
104#define Hangul_TBase 0x11A7
105#define Hangul_TFinal 0x11C2
106#define Hangul_TCount 28
107
108#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
109#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
110#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
111#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
112#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
113#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
114/* HANGUL end */
115
116/* this is used for canonical ordering of combining characters (c.c.). */
117typedef struct {
118 U8 cc; /* combining class */
119 UV uv; /* codepoint */
120 STRLEN pos; /* position */
121} UNF_cc;
122
123static int compare_cc(const void *a, const void *b)
124{
125 int ret_cc;
126 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
127 if (ret_cc)
128 return ret_cc;
129
130 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
131 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
132}
133
134static U8* dec_canonical(UV uv)
135{
136 U8 ***plane, **row;
137 if (OVER_UTF_MAX(uv))
138 return NULL;
139 plane = (U8***)UNF_canon[uv >> 16];
140 if (! plane)
141 return NULL;
142 row = plane[(uv >> 8) & 0xff];
143 return row ? row[uv & 0xff] : NULL;
144}
145
146static U8* dec_compat(UV uv)
147{
148 U8 ***plane, **row;
149 if (OVER_UTF_MAX(uv))
150 return NULL;
151 plane = (U8***)UNF_compat[uv >> 16];
152 if (! plane)
153 return NULL;
154 row = plane[(uv >> 8) & 0xff];
155 return row ? row[uv & 0xff] : NULL;
156}
157
158static UV composite_uv(UV uv, UV uv2)
159{
160 UNF_complist ***plane, **row, *cell, *i;
161
162 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
163 return 0;
164
165 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
166 UV lindex = uv - Hangul_LBase;
167 UV vindex = uv2 - Hangul_VBase;
168 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
169 Hangul_TCount);
170 }
171 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
172 UV tindex = uv2 - Hangul_TBase;
173 return(uv + tindex);
174 }
175 plane = UNF_compos[uv >> 16];
176 if (! plane)
177 return 0;
178 row = plane[(uv >> 8) & 0xff];
179 if (! row)
180 return 0;
181 cell = row[uv & 0xff];
182 if (! cell)
183 return 0;
184 for (i = cell; i->nextchar; i++) {
185 if (uv2 == i->nextchar)
186 return i->composite;
187 }
188 return 0;
189}
190
191static U8 getCombinClass(UV uv)
192{
193 U8 **plane, *row;
194 if (OVER_UTF_MAX(uv))
195 return 0;
196 plane = (U8**)UNF_combin[uv >> 16];
197 if (! plane)
198 return 0;
199 row = plane[(uv >> 8) & 0xff];
200 return row ? row[uv & 0xff] : 0;
201}
202
203static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
204{
205 UV sindex = uv - Hangul_SBase;
206 UV lindex = sindex / Hangul_NCount;
207 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
208 UV tindex = sindex % Hangul_TCount;
209
210 if (! Hangul_IsS(uv))
211 return d;
212
213 d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
214 d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
215 if (tindex)
216 d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
217 return d;
218}
219
220static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
221{
222 char *s;
223 STRLEN len;
224 s = SvPV(sv,len);
225 if (!SvUTF8(sv)) {
226 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
227 if (!SvPOK(tmpsv))
228 s = SvPV_force(tmpsv,len);
229 sv_utf8_upgrade(tmpsv);
230 s = SvPV(tmpsv,len);
231 }
232 if (lp)
233 *lp = len;
234 return s;
235}
236
237static
238U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
239{
240 U8* p = s;
241 U8* e = s + slen;
242 U8* dstart = *dp;
243 U8* d = dstart;
244
245 while (p < e) {
246 STRLEN retlen;
247 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
248 if (!retlen)
249 croak(ErrRetlenIsZero, "decompose");
250 p += retlen;
251
252 if (Hangul_IsS(uv)) {
253 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
254 d = pv_cat_decompHangul(aTHX_ d, uv);
255 }
256 else {
257 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
258
259 if (r) {
260 STRLEN len = (STRLEN)strlen((char *)r);
261 Renew_d_if_not_enough_to(len)
262 while (len--)
263 *d++ = *r++;
264 }
265 else {
266 Renew_d_if_not_enough_to(UTF8_MAXLEN)
267 d = uvchr_to_utf8(d, uv);
268 }
269 }
270 }
271 *dp = dstart;
272 return d;
273}
274
275static
276U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
277{
278 U8* p = s;
279 U8* e = s + slen;
280 U8* dstart = *dp;
281 U8* d = dstart;
282
283 UNF_cc seq_ary[CC_SEQ_SIZE];
284 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
285 UNF_cc* seq_ext = NULL; /* extend if need */
286 STRLEN seq_max = CC_SEQ_SIZE;
287 STRLEN cc_pos = 0;
288
289 while (p < e) {
290 U8 curCC;
291 STRLEN retlen;
292 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
293 if (!retlen)
294 croak(ErrRetlenIsZero, "reorder");
295 p += retlen;
296
297 curCC = getCombinClass(uv);
298
299 if (curCC != 0) {
300 if (seq_max < cc_pos + 1) { /* extend if need */
301 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
302 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
303 STRLEN i;
304 New(0, seq_ext, seq_max, UNF_cc);
305 for (i = 0; i < cc_pos; i++)
306 seq_ext[i] = seq_ary[i];
307 }
308 else {
309 Renew(seq_ext, seq_max, UNF_cc);
310 }
311 seq_ptr = seq_ext; /* use seq_ext from now */
312 }
313
314 seq_ptr[cc_pos].cc = curCC;
315 seq_ptr[cc_pos].uv = uv;
316 seq_ptr[cc_pos].pos = cc_pos;
317 ++cc_pos;
318
319 if (p < e)
320 continue;
321 }
322
323 /* output */
324 if (cc_pos) {
325 STRLEN i;
326
327 if (cc_pos > 1) /* reordered if there are two c.c.'s */
328 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
329
330 for (i = 0; i < cc_pos; i++) {
331 Renew_d_if_not_enough_to(UTF8_MAXLEN)
332 d = uvchr_to_utf8(d, seq_ptr[i].uv);
333 }
334 cc_pos = 0;
335 }
336
337 if (curCC == 0) {
338 Renew_d_if_not_enough_to(UTF8_MAXLEN)
339 d = uvchr_to_utf8(d, uv);
340 }
341 }
342 if (seq_ext)
343 Safefree(seq_ext);
344 *dp = dstart;
345 return d;
346}
347
348static
349U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
350{
351 U8* p = s;
352 U8* e = s + slen;
353 U8* dstart = *dp;
354 U8* d = dstart;
355
356 UV uvS = 0; /* code point of the starter */
357 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
358 U8 preCC = 0;
359
360 UV seq_ary[CC_SEQ_SIZE];
361 UV* seq_ptr = seq_ary; /* use array at the beginning */
362 UV* seq_ext = NULL; /* extend if need */
363 STRLEN seq_max = CC_SEQ_SIZE;
364 STRLEN cc_pos = 0;
365
366 while (p < e) {
367 U8 curCC;
368 STRLEN retlen;
369 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
370 if (!retlen)
371 croak(ErrRetlenIsZero, "compose");
372 p += retlen;
373
374 curCC = getCombinClass(uv);
375
376 if (!valid_uvS) {
377 if (curCC == 0) {
378 uvS = uv; /* the first Starter is found */
379 valid_uvS = TRUE;
380 if (p < e)
381 continue;
382 }
383 else {
384 Renew_d_if_not_enough_to(UTF8_MAXLEN)
385 d = uvchr_to_utf8(d, uv);
386 continue;
387 }
388 }
389 else {
390 bool composed;
391
392 /* blocked */
393 if ((iscontig && cc_pos) || /* discontiguous combination */
394 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
395 (preCC > curCC)) /* blocked by higher CC: revised D2 */
396 composed = FALSE;
397
398 /* not blocked:
399 iscontig && cc_pos == 0 -- contiguous combination
400 curCC == 0 && preCC == 0 -- starter + starter
401 curCC != 0 && preCC < curCC -- lower CC */
402 else {
403 /* try composition */
404 UV uvComp = composite_uv(uvS, uv);
405
406 if (uvComp && !isExclusion(uvComp)) {
407 uvS = uvComp;
408 composed = TRUE;
409
410 /* preCC should not be changed to curCC */
411 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
412 if (p < e)
413 continue;
414 }
415 else
416 composed = FALSE;
417 }
418
419 if (!composed) {
420 preCC = curCC;
421 if (curCC != 0 || !(p < e)) {
422 if (seq_max < cc_pos + 1) { /* extend if need */
423 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
424 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
425 New(0, seq_ext, seq_max, UV);
426 Copy(seq_ary, seq_ext, cc_pos, UV);
427 }
428 else {
429 Renew(seq_ext, seq_max, UV);
430 }
431 seq_ptr = seq_ext; /* use seq_ext from now */
432 }
433 seq_ptr[cc_pos] = uv;
434 ++cc_pos;
435 }
436 if (curCC != 0 && p < e)
437 continue;
438 }
439 }
440
441 /* output */
442 {
443 Renew_d_if_not_enough_to(UTF8_MAXLEN)
444 d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
445 }
446
447 if (cc_pos) {
448 STRLEN i;
449
450 for (i = 0; i < cc_pos; i++) {
451 Renew_d_if_not_enough_to(UTF8_MAXLEN)
452 d = uvchr_to_utf8(d, seq_ptr[i]);
453 }
454 cc_pos = 0;
455 }
456
457 uvS = uv;
458 }
459 if (seq_ext)
460 Safefree(seq_ext);
461 *dp = dstart;
462 return d;
463}
464
465MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
466
467SV*
468decompose(src, compat = &PL_sv_no)
469 SV * src
470 SV * compat
471 PROTOTYPE: $;$
472 PREINIT:
473 SV* dst;
474 U8 *s, *d, *dend;
475 STRLEN slen, dlen;
476 CODE:
477 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
478 dst = newSVpvn("", 0);
479 dlen = slen;
480 New(0, d, dlen+1, U8);
481 dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
482 sv_setpvn(dst, (char *)d, dend - d);
483 SvUTF8_on(dst);
484 Safefree(d);
485 RETVAL = dst;
486 OUTPUT:
487 RETVAL
488
489
490SV*
491reorder(src)
492 SV * src
493 PROTOTYPE: $
494 PREINIT:
495 SV* dst;
496 U8 *s, *d, *dend;
497 STRLEN slen, dlen;
498 CODE:
499 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
500 dst = newSVpvn("", 0);
501 dlen = slen;
502 New(0, d, dlen+1, U8);
503 dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
504 sv_setpvn(dst, (char *)d, dend - d);
505 SvUTF8_on(dst);
506 Safefree(d);
507 RETVAL = dst;
508 OUTPUT:
509 RETVAL
510
511
512SV*
513compose(src)
514 SV * src
515 PROTOTYPE: $
516 ALIAS:
517 composeContiguous = 1
518 PREINIT:
519 SV* dst;
520 U8 *s, *d, *dend;
521 STRLEN slen, dlen;
522 CODE:
523 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
524 dst = newSVpvn("", 0);
525 dlen = slen;
526 New(0, d, dlen+1, U8);
527 dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
528 sv_setpvn(dst, (char *)d, dend - d);
529 SvUTF8_on(dst);
530 Safefree(d);
531 RETVAL = dst;
532 OUTPUT:
533 RETVAL
534
535
536SV*
537NFD(src)
538 SV * src
539 PROTOTYPE: $
540 ALIAS:
541 NFKD = 1
542 PREINIT:
543 SV *dst;
544 U8 *s, *t, *tend, *d, *dend;
545 STRLEN slen, tlen, dlen;
546 CODE:
547 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
548
549 /* decompose */
550 tlen = slen;
551 New(0, t, tlen+1, U8);
552 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
553 *tend = '\0';
554 tlen = tend - t; /* no longer know real size of t */
555
556 /* reorder */
557 dlen = tlen;
558 New(0, d, dlen+1, U8);
559 dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
560 *dend = '\0';
561 dlen = dend - d; /* no longer know real size of d */
562
563 /* return */
564 dst = newSVpvn("", 0);
565 sv_setpvn(dst, (char *)d, dlen);
566 SvUTF8_on(dst);
567
568 Safefree(t);
569 Safefree(d);
570 RETVAL = dst;
571 OUTPUT:
572 RETVAL
573
574
575SV*
576NFC(src)
577 SV * src
578 PROTOTYPE: $
579 ALIAS:
580 NFKC = 1
581 FCC = 2
582 PREINIT:
583 SV *dst;
584 U8 *s, *t, *tend, *u, *uend, *d, *dend;
585 STRLEN slen, tlen, ulen, dlen;
586 CODE:
587 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
588
589 /* decompose */
590 tlen = slen;
591 New(0, t, tlen+1, U8);
592 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
593 *tend = '\0';
594 tlen = tend - t; /* no longer know real size of t */
595
596 /* reorder */
597 ulen = tlen;
598 New(0, u, ulen+1, U8);
599 uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
600 *uend = '\0';
601 ulen = uend - u; /* no longer know real size of u */
602
603 /* compose */
604 dlen = ulen;
605 New(0, d, dlen+1, U8);
606 dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
607 *dend = '\0';
608 dlen = dend - d; /* no longer know real size of d */
609
610 /* return */
611 dst = newSVpvn("", 0);
612 sv_setpvn(dst, (char *)d, dlen);
613 SvUTF8_on(dst);
614
615 Safefree(t);
616 Safefree(u);
617 Safefree(d);
618 RETVAL = dst;
619 OUTPUT:
620 RETVAL
621
622
623SV*
624checkNFD(src)
625 SV * src
626 PROTOTYPE: $
627 ALIAS:
628 checkNFKD = 1
629 PREINIT:
630 STRLEN srclen, retlen;
631 U8 *s, *e, *p, curCC, preCC;
632 bool result = TRUE;
633 CODE:
634 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
635 e = s + srclen;
636
637 preCC = 0;
638 for (p = s; p < e; p += retlen) {
639 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
640 if (!retlen)
641 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
642
643 curCC = getCombinClass(uv);
644 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
645 result = FALSE;
646 break;
647 }
648 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
649 result = FALSE;
650 break;
651 }
652 preCC = curCC;
653 }
654 RETVAL = boolSV(result);
655 OUTPUT:
656 RETVAL
657
658
659SV*
660checkNFC(src)
661 SV * src
662 PROTOTYPE: $
663 ALIAS:
664 checkNFKC = 1
665 PREINIT:
666 STRLEN srclen, retlen;
667 U8 *s, *e, *p, curCC, preCC;
668 bool result = TRUE;
669 bool isMAYBE = FALSE;
670 CODE:
671 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
672 e = s + srclen;
673
674 preCC = 0;
675 for (p = s; p < e; p += retlen) {
676 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
677 if (!retlen)
678 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
679
680 curCC = getCombinClass(uv);
681 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
682 result = FALSE;
683 break;
684 }
685
686 /* get NFC/NFKC property */
687 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
688 ; /* YES */
689 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
690 result = FALSE;
691 break;
692 }
693 else if (isComp2nd(uv))
694 isMAYBE = TRUE;
695 else if (ix) {
696 char *canon, *compat;
697 /* NFKC_NO when having compatibility mapping. */
698 canon = (char *) dec_canonical(uv);
699 compat = (char *) dec_compat(uv);
700 if (compat && !(canon && strEQ(canon, compat))) {
701 result = FALSE;
702 break;
703 }
704 } /* end of get NFC/NFKC property */
705
706 preCC = curCC;
707 }
708 if (isMAYBE && result) /* NO precedes MAYBE */
709 XSRETURN_UNDEF;
710 RETVAL = boolSV(result);
711 OUTPUT:
712 RETVAL
713
714
715SV*
716checkFCD(src)
717 SV * src
718 PROTOTYPE: $
719 ALIAS:
720 checkFCC = 1
721 PREINIT:
722 STRLEN srclen, retlen;
723 U8 *s, *e, *p, curCC, preCC;
724 bool result = TRUE;
725 bool isMAYBE = FALSE;
726 CODE:
727 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
728 e = s + srclen;
729 preCC = 0;
730 for (p = s; p < e; p += retlen) {
731 U8 *sCan;
732 UV uvLead;
733 STRLEN canlen = 0;
734 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
735 if (!retlen)
736 croak(ErrRetlenIsZero, "checkFCD or -FCC");
737
738 sCan = (U8*) dec_canonical(uv);
739
740 if (sCan) {
741 STRLEN canret;
742 canlen = (STRLEN)strlen((char *) sCan);
743 uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF);
744 if (!canret)
745 croak(ErrRetlenIsZero, "checkFCD or -FCC");
746 }
747 else {
748 uvLead = uv;
749 }
750
751 curCC = getCombinClass(uvLead);
752
753 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
754 result = FALSE;
755 break;
756 }
757
758 if (ix) {
759 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
760 result = FALSE;
761 break;
762 }
763 else if (isComp2nd(uv))
764 isMAYBE = TRUE;
765 }
766
767 if (sCan) {
768 STRLEN canret;
769 UV uvTrail;
770 U8* eCan = sCan + canlen;
771 U8* pCan = utf8_hop(eCan, -1);
772 if (pCan < sCan)
773 croak(ErrHopBeforeStart);
774 uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF);
775 if (!canret)
776 croak(ErrRetlenIsZero, "checkFCD or -FCC");
777 preCC = getCombinClass(uvTrail);
778 }
779 else {
780 preCC = curCC;
781 }
782 }
783 if (isMAYBE && result) /* NO precedes MAYBE */
784 XSRETURN_UNDEF;
785 RETVAL = boolSV(result);
786 OUTPUT:
787 RETVAL
788
789
790U8
791getCombinClass(uv)
792 UV uv
793 PROTOTYPE: $
794
795bool
796isExclusion(uv)
797 UV uv
798 PROTOTYPE: $
799
800bool
801isSingleton(uv)
802 UV uv
803 PROTOTYPE: $
804
805bool
806isNonStDecomp(uv)
807 UV uv
808 PROTOTYPE: $
809
810bool
811isComp2nd(uv)
812 UV uv
813 PROTOTYPE: $
814 ALIAS:
815 isNFC_MAYBE = 1
816 isNFKC_MAYBE = 2
662aea32
CBW
817 INIT:
818 PERL_UNUSED_VAR(ix);
c6b7cc21
SH
819
820SV*
821isNFD_NO(uv)
822 UV uv
823 PROTOTYPE: $
824 ALIAS:
825 isNFKD_NO = 1
826 PREINIT:
827 bool result = FALSE;
828 CODE:
829 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
830 result = TRUE; /* NFD_NO or NFKD_NO */
831 RETVAL = boolSV(result);
832 OUTPUT:
833 RETVAL
834
835
836SV*
837isComp_Ex(uv)
838 UV uv
839 PROTOTYPE: $
840 ALIAS:
841 isNFC_NO = 0
842 isNFKC_NO = 1
843 PREINIT:
844 bool result = FALSE;
845 CODE:
846 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
847 result = TRUE; /* NFC_NO or NFKC_NO */
848 else if (ix) {
849 char *canon, *compat;
850 canon = (char *) dec_canonical(uv);
851 compat = (char *) dec_compat(uv);
852 if (compat && (!canon || strNE(canon, compat)))
853 result = TRUE; /* NFC_NO or NFKC_NO */
854 }
855 RETVAL = boolSV(result);
856 OUTPUT:
857 RETVAL
858
859SV*
860getComposite(uv, uv2)
861 UV uv
862 UV uv2
863 PROTOTYPE: $$
864 PREINIT:
865 UV composite;
866 CODE:
867 composite = composite_uv(uv, uv2);
868 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
869 OUTPUT:
870 RETVAL
871
872
873
874SV*
875getCanon(uv)
876 UV uv
877 PROTOTYPE: $
878 ALIAS:
879 getCompat = 1
880 CODE:
881 if (Hangul_IsS(uv)) {
882 U8 tmp[3 * UTF8_MAXLEN + 1];
883 U8 *t = tmp;
884 U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
885 RETVAL = newSVpvn((char *)t, e - t);
886 } else {
887 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
888 if (!rstr)
889 XSRETURN_UNDEF;
890 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
891 }
892 SvUTF8_on(RETVAL);
893 OUTPUT:
894 RETVAL
895
896
897void
898splitOnLastStarter(src)
899 SV * src
900 PREINIT:
901 SV *svp;
902 STRLEN srclen;
903 U8 *s, *e, *p;
904 PPCODE:
905 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
906 e = s + srclen;
907 p = e;
908 while (s < p) {
909 UV uv;
910 p = utf8_hop(p, -1);
911 if (p < s)
912 croak(ErrHopBeforeStart);
913 uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF);
914 if (getCombinClass(uv) == 0) /* Last Starter found */
915 break;
916 }
917
918 svp = sv_2mortal(newSVpvn((char*)s, p - s));
919 SvUTF8_on(svp);
920 XPUSHs(svp);
921
922 svp = sv_2mortal(newSVpvn((char*)p, e - p));
923 SvUTF8_on(svp);
924 XPUSHs(svp);
925