2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
4 /* private functions which need pTHX_ and aTHX_
16 #define NEED_utf8_to_uvchr_buf
19 /* These 5 files are prepared by mkheader */
26 /* The generated normalization tables since v5.20 are in native character set
27 * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for
28 * later perls, and redefine that to be 'uvuni' for earlier ones */
29 #if PERL_VERSION_LT(5,20,0)
32 # define uvchr_to_utf8 uvuni_to_utf8
33 # else /* Perl 5.6.1 */
34 # define uvchr_to_utf8 uv_to_utf8
38 /* Make sure the output buffer has room for (need) more bytes before uvchr_to_utf8() writes into it. */
39 /* dstart, d, and dlen must already be declared in the enclosing scope. */
/* The expansion begins with a declaration of curlen, so it can be used at most
 * once per scope. When the space is short, the buffer is Renew()ed and d is
 * re-pointed to the same offset in the (possibly relocated) buffer.
 * NOTE(review): some continuation lines of this macro are not visible in this excerpt. */
40 #define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \
41 if (dlen < curlen + (need)) { \
43 Renew(dstart, dlen+1, U8); \
44 d = dstart + curlen; \
47 /* Croak message for the case where utf8_to_uvchr_buf() reports a character length (retlen) of 0, e.g. on broken input; %s names the operation in progress. */
48 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
50 /* Croak message for the case where utf8_hop() stepped back past the start of the string, which suggests malformed UTF-8. */
51 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
53 /* Code points above 0x10ffff are currently left untouched, with no warning or error (the original author left open whether that is the right policy). */
54 #define VALID_UTF_MAX (0x10ffff)
/* True when uv lies above VALID_UTF_MAX; the table lookups in this file test this before indexing. */
55 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
57 /* CC_SEQ_SIZE: initial capacity of the arrays that collect combining characters. */
58 /* CC_SEQ_STEP: extra slots requested each time that capacity is exceeded. Whether these values are large enough is an open question from the original author. */
59 #define CC_SEQ_SIZE (10)
60 #define CC_SEQ_STEP (5)
/* Constants for the arithmetic Hangul syllable (de)composition of the Unicode Standard:
 * S = precomposed syllables, L = leading consonants, V = vowels, T = trailing consonants.
 * xBase and xFinal bound each range; xCount is the number of values in it.
 * Hangul_NCount (588) equals Hangul_VCount * Hangul_TCount, and
 * Hangul_SCount (11172) equals Hangul_LCount * Hangul_NCount. */
63 #define Hangul_SBase 0xAC00
64 #define Hangul_SFinal 0xD7A3
65 #define Hangul_SCount 11172
67 #define Hangul_NCount 588
69 #define Hangul_LBase 0x1100
70 #define Hangul_LFinal 0x1112
71 #define Hangul_LCount 19
73 #define Hangul_VBase 0x1161
74 #define Hangul_VFinal 0x1175
75 #define Hangul_VCount 21
/* Hangul_TCount (28) spans Hangul_TBase..Hangul_TFinal inclusive; index 0 (Hangul_TBase itself) stands for no trailing consonant. */
77 #define Hangul_TBase 0x11A7
78 #define Hangul_TFinal 0x11C2
79 #define Hangul_TCount 28
/* Predicates on a code point u.
 * Hangul_IsN only tests that the offset from Hangul_SBase is a multiple of
 * Hangul_TCount and performs no range check, so Hangul_IsLV pairs it with Hangul_IsS.
 * Hangul_IsT compares with a strict less-than against Hangul_TBase, so
 * Hangul_TBase itself is not accepted as a trailing consonant. */
81 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
82 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
83 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
84 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
85 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
86 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
89 /* Record used for canonical ordering of combining characters (c.c.). */
/* One record is stored per collected character; compare_cc() orders records by cc and by pos.
 * NOTE(review): the enclosing struct/typedef lines are not visible in this excerpt. */
91 U8 cc; /* combining class */
92 UV uv; /* codepoint */
93 STRLEN pos; /* position */
/* qsort() comparator for UNF_cc records.
 * ret_cc receives the difference of the two combining classes. The final
 * return compares the original positions and yields -1, 0 or 1.
 * NOTE(review): the lines in between are not visible here; presumably a
 * non-zero ret_cc is returned first, making pos a tie-breaker that keeps
 * the sort stable - confirm against the full source. */
96 static int compare_cc(const void *a, const void *b)
99 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
103 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
104 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
/* Fetch the canonical decomposition mapping of uv from the UNF_canon table.
 * The table is walked in three steps: plane (uv >> 16), row (bits 8-15 of uv),
 * then cell (low 8 bits of uv). The result is the cell content, or NULL when
 * the row is missing. Callers in this file treat a non-NULL result as a
 * NUL-terminated byte string (they pass it to strlen or string comparison).
 * OVER_UTF_MAX(uv) is tested before the table is indexed.
 * NOTE(review): the early-return lines are not visible in this excerpt. */
107 static U8* dec_canonical(UV uv)
110 if (OVER_UTF_MAX(uv))
112 plane = (U8***)UNF_canon[uv >> 16];
115 row = plane[(U8) (uv >> 8)];
116 return row ? row[(U8) uv] : NULL;
/* Fetch the compatibility decomposition mapping of uv from the UNF_compat table.
 * Same three-step walk as the canonical lookup: plane (uv >> 16), row
 * (bits 8-15 of uv), cell (low 8 bits of uv). Returns the cell content, or
 * NULL when the row is missing. OVER_UTF_MAX(uv) is tested before indexing.
 * NOTE(review): the early-return lines are not visible in this excerpt. */
119 static U8* dec_compat(UV uv)
122 if (OVER_UTF_MAX(uv))
124 plane = (U8***)UNF_compat[uv >> 16];
127 row = plane[(U8) (uv >> 8)];
128 return row ? row[(U8) uv] : NULL;
/* Find the primary composite of the pair (uv, uv2).
 * Callers in this file treat a zero result as "no composite".
 * NOTE(review): several lines (returns, NULL checks, closing braces) are not
 * visible in this excerpt; the comments below describe only what is shown. */
131 static UV composite_uv(UV uv, UV uv2)
133 UNF_complist ***plane, **row, *cell, *i;
/* Reject a zero second character and anything above VALID_UTF_MAX up front. */
135 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
/* Hangul leading consonant + vowel: computed arithmetically from the two indexes. */
138 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
139 UV lindex = uv - Hangul_LBase;
140 UV vindex = uv2 - Hangul_VBase;
141 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
/* Hangul LV syllable + trailing consonant: tindex is the offset of uv2 from Hangul_TBase. */
144 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
145 UV tindex = uv2 - Hangul_TBase;
/* Everything else goes through the UNF_compos table, indexed by plane and row of uv. */
148 plane = UNF_compos[uv >> 16];
151 row = plane[(U8) (uv >> 8)];
/* Scan the candidate list of the cell for uv2; an entry with nextchar == 0 ends the list. */
157 for (i = cell; i->nextchar; i++) {
158 if (uv2 == i->nextchar)
/* Return the canonical combining class of uv from the UNF_combin table.
 * The table is indexed by plane (uv >> 16), row (bits 8-15 of uv) and then
 * the low 8 bits of uv. A missing row yields class 0.
 * OVER_UTF_MAX(uv) is tested before the table is indexed.
 * NOTE(review): the early-return lines are not visible in this excerpt. */
164 static U8 getCombinClass(UV uv)
167 if (OVER_UTF_MAX(uv))
169 plane = (U8**)UNF_combin[uv >> 16];
172 row = plane[(U8) (uv >> 8)];
173 return row ? row[(U8) uv] : 0;
/* Append the jamo decomposition of Hangul syllable uv at d as UTF-8.
 * Callers use the return value as the new write position (end of output)
 * and reserve room for three characters beforehand.
 * NOTE(review): the return statements and the guard in front of the
 * trailing-consonant write are not visible in this excerpt. */
176 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
/* Split the syllable offset into leading-consonant, vowel and trailing-consonant indexes.
 * This arithmetic runs before the range check below. */
178 UV sindex = uv - Hangul_SBase;
179 UV lindex = sindex / Hangul_NCount;
180 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
181 UV tindex = sindex % Hangul_TCount;
/* Not a precomposed syllable: handled by the (elided) statement that follows. */
183 if (! Hangul_IsS(uv))
186 d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
187 d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
/* presumably emitted only when tindex is non-zero - confirm against the full source */
189 d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
/* Return the string buffer of sv for use as UTF-8 input, reporting its length through lp
 * (as the callers in this file use it).
 * The visible lines take a mortal copy of the bytes, force it to a string and
 * upgrade the copy to UTF-8, so the upgrade is applied to the copy rather than to sv.
 * NOTE(review): the condition that selects this path and the final
 * assignment/return are not visible in this excerpt. */
193 static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
199 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
201 s = SvPV_force(tmpsv,len);
202 sv_utf8_upgrade(tmpsv);
/* Decompose the UTF-8 string s (slen bytes) into the buffer referenced by dp.
 *   dp       - address of the output buffer pointer; callers allocate dlen+1 bytes
 *   dlen     - current capacity of that buffer
 *   iscompat - true selects compatibility mappings, false canonical ones
 * Callers use the return value as the end of the written output.
 * NOTE(review): loop header, buffer bookkeeping and the return are not
 * visible in this excerpt; presumably *dp is refreshed when the buffer is
 * reallocated - confirm against the full source. */
211 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
220 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
222 croak(ErrRetlenIsZero, "decompose");
/* Hangul syllables are decomposed arithmetically into at most three characters. */
225 if (Hangul_IsS(uv)) {
226 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
227 d = pv_cat_decompHangul(aTHX_ d, uv);
/* Otherwise consult the mapping table chosen by iscompat. */
230 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
/* The mapping is a NUL-terminated byte string; reserve exactly its length. */
233 STRLEN len = (STRLEN)strlen((char *)r);
234 Renew_d_if_not_enough_to(len)
/* Remaining path: the character is written out as it is. */
239 Renew_d_if_not_enough_to(UTF8_MAXLEN)
240 d = uvchr_to_utf8(d, uv);
/* Put runs of combining characters in the UTF-8 string s (slen bytes) into
 * canonical order, writing the result to the buffer referenced by dp.
 *   dp   - address of the output buffer pointer; callers allocate dlen+1 bytes
 *   dlen - current capacity of that buffer
 * Callers use the return value as the end of the written output.
 * NOTE(review): loop header, several conditions and the return are not
 * visible in this excerpt; comments below describe only the lines shown. */
249 U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
/* Collected characters live in a fixed local array first and move to heap storage only if it overflows. */
256 UNF_cc seq_ary[CC_SEQ_SIZE];
257 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
258 UNF_cc* seq_ext = NULL; /* extend if need */
259 STRLEN seq_max = CC_SEQ_SIZE;
265 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
267 croak(ErrRetlenIsZero, "reorder");
270 curCC = getCombinClass(uv);
273 if (seq_max < cc_pos + 1) { /* extend if need */
274 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
/* First overflow: allocate heap storage and copy the local array into it. */
275 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
277 New(0, seq_ext, seq_max, UNF_cc);
278 for (i = 0; i < cc_pos; i++)
279 seq_ext[i] = seq_ary[i];
/* Later overflows: grow the existing heap storage. */
282 Renew(seq_ext, seq_max, UNF_cc);
284 seq_ptr = seq_ext; /* use seq_ext from now */
/* Record the character together with its class and its arrival position (used as sort tie-breaker). */
287 seq_ptr[cc_pos].cc = curCC;
288 seq_ptr[cc_pos].uv = uv;
289 seq_ptr[cc_pos].pos = cc_pos;
300 if (cc_pos > 1) /* sorting is only needed when at least two c.c. were collected */
301 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
/* Emit the collected characters in their sorted order. */
303 for (i = 0; i < cc_pos; i++) {
304 Renew_d_if_not_enough_to(UTF8_MAXLEN)
305 d = uvchr_to_utf8(d, seq_ptr[i].uv);
/* Emit the current character itself (presumably the one with class 0 that ended the run - confirm). */
311 Renew_d_if_not_enough_to(UTF8_MAXLEN)
312 d = uvchr_to_utf8(d, uv);
/* Compose the UTF-8 string s (slen bytes) into the buffer referenced by dp.
 *   dp       - address of the output buffer pointer; callers allocate dlen+1 bytes
 *   dlen     - current capacity of that buffer
 *   iscontig - true restricts composition to contiguous pairs
 * Callers use the return value as the end of the written output.
 * NOTE(review): loop header, many branches and the return are not visible
 * in this excerpt; comments below describe only the lines shown. */
322 U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
329 UV uvS = 0; /* code point of the starter */
330 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
/* Characters that did not compose with the starter are parked here: a fixed
 * local array first, heap storage once it overflows. */
333 UV seq_ary[CC_SEQ_SIZE];
334 UV* seq_ptr = seq_ary; /* use array at the beginning */
335 UV* seq_ext = NULL; /* extend if need */
336 STRLEN seq_max = CC_SEQ_SIZE;
342 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
344 croak(ErrRetlenIsZero, "compose");
347 curCC = getCombinClass(uv);
351 uvS = uv; /* the first Starter is found */
/* Path that writes the current character straight to the output. */
357 Renew_d_if_not_enough_to(UTF8_MAXLEN)
358 d = uvchr_to_utf8(d, uv);
/* Conditions under which the current character may not combine with the starter. */
366 if ((iscontig && cc_pos) || /* discontiguous combination */
367 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
368 (preCC > curCC)) /* blocked by higher CC: revised D2 */
372 iscontig && cc_pos == 0 -- contiguous combination
373 curCC == 0 && preCC == 0 -- starter + starter
374 curCC != 0 && preCC < curCC -- lower CC */
376 /* try composition */
377 UV uvComp = composite_uv(uvS, uv);
/* A composite is accepted only when one exists and it is not a composition exclusion. */
379 if (uvComp && !isExclusion(uvComp)) {
383 /* preCC should not be changed to curCC */
384 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
/* Park the character when it is a combining character or when the input is exhausted. */
394 if (curCC != 0 || !(p < e)) {
395 if (seq_max < cc_pos + 1) { /* extend if need */
396 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
/* First overflow: allocate heap storage and copy the local array; later overflows grow it. */
397 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
398 New(0, seq_ext, seq_max, UV);
399 Copy(seq_ary, seq_ext, cc_pos, UV);
402 Renew(seq_ext, seq_max, UV);
404 seq_ptr = seq_ext; /* use seq_ext from now */
406 seq_ptr[cc_pos] = uv;
/* More input remains and the character was a combining one. */
409 if (curCC != 0 && p < e)
/* Flush: first the starter, then every parked character in arrival order. */
416 Renew_d_if_not_enough_to(UTF8_MAXLEN)
417 d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
423 for (i = 0; i < cc_pos; i++) {
424 Renew_d_if_not_enough_to(UTF8_MAXLEN)
425 d = uvchr_to_utf8(d, seq_ptr[i]);
438 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
441 decompose(src, compat = &PL_sv_no)
/* Take src as UTF-8, decompose it into a freshly allocated buffer d and copy
 * the bytes between d and dend into the new SV dst. compat defaults to a
 * false value; a true value selects compatibility decomposition.
 * NOTE(review): declarations, the dlen setup and cleanup are not visible in this excerpt. */
450 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
451 dst = newSVpvn("", 0);
/* one byte more than dlen is allocated */
453 New(0, d, dlen+1, U8);
454 dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
455 sv_setpvn(dst, (char *)d, dend - d);
/* Take src as UTF-8, canonically reorder it into a freshly allocated buffer d
 * and copy the bytes between d and dend into the new SV dst.
 * NOTE(review): the XS signature, declarations, the dlen setup and cleanup
 * are not visible in this excerpt. */
472 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
473 dst = newSVpvn("", 0);
/* one byte more than dlen is allocated */
475 New(0, d, dlen+1, U8);
476 dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
477 sv_setpvn(dst, (char *)d, dend - d);
490 composeContiguous = 1
/* Take src as UTF-8, compose it into a freshly allocated buffer d and copy
 * the bytes between d and dend into the new SV dst. The alias index ix is
 * passed as the iscontig flag, so composeContiguous (ix == 1) requests
 * contiguous composition.
 * NOTE(review): the XS signature, other aliases, the dlen setup and cleanup
 * are not visible in this excerpt. */
496 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
497 dst = newSVpvn("", 0);
/* one byte more than dlen is allocated */
499 New(0, d, dlen+1, U8);
500 dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
501 sv_setpvn(dst, (char *)d, dend - d);
/* Two-stage pipeline: decompose src into t, then reorder t into d, and return d as a new SV.
 * NOTE(review): the XS signature, aliases, the tlen/dlen setup and buffer
 * cleanup are not visible in this excerpt. */
517 U8 *s, *t, *tend, *d, *dend;
518 STRLEN slen, tlen, dlen;
520 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
/* Stage 1: decomposition; alias index 1 selects compatibility mappings. */
524 New(0, t, tlen+1, U8);
525 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
/* tlen now holds the length of the content, not the capacity of the buffer. */
527 tlen = tend - t; /* no longer know real size of t */
/* Stage 2: canonical reordering of the decomposed text. */
531 New(0, d, dlen+1, U8);
532 dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
534 dlen = dend - d; /* no longer know real size of d */
537 dst = newSVpvn("", 0);
538 sv_setpvn(dst, (char *)d, dlen);
/* Three-stage pipeline: decompose src into t, reorder t into u, compose u
 * into d, and return d as a new SV.
 * NOTE(review): the XS signature, aliases, the tlen/ulen/dlen setup and
 * buffer cleanup are not visible in this excerpt. */
557 U8 *s, *t, *tend, *u, *uend, *d, *dend;
558 STRLEN slen, tlen, ulen, dlen;
560 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
/* Stage 1: decomposition; alias index 1 selects compatibility mappings. */
564 New(0, t, tlen+1, U8);
565 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
/* tlen now holds the length of the content, not the capacity of the buffer. */
567 tlen = tend - t; /* no longer know real size of t */
/* Stage 2: canonical reordering. */
571 New(0, u, ulen+1, U8);
572 uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
574 ulen = uend - u; /* no longer know real size of u */
/* Stage 3: composition; alias index 2 requests contiguous composition. */
578 New(0, d, dlen+1, U8);
579 dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
581 dlen = dend - d; /* no longer know real size of d */
584 dst = newSVpvn("", 0);
585 sv_setpvn(dst, (char *)d, dlen);
/* Quick check: scan src one character at a time and report the outcome as a boolean SV.
 * NOTE(review): the XS signature, initialisation of e/preCC/result and the
 * bodies of the two failing branches are not visible in this excerpt. */
603 STRLEN srclen, retlen;
604 U8 *s, *e, *p, curCC, preCC;
607 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
611 for (p = s; p < e; p += retlen) {
612 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
614 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
616 curCC = getCombinClass(uv);
617 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
/* A Hangul syllable or any character with a mapping (compatibility table
 * when ix is non-zero, canonical otherwise) is decomposable. */
621 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
627 RETVAL = boolSV(result);
/* Quick check with three possible outcomes (yes, no, maybe): scan src one
 * character at a time and classify each character.
 * NOTE(review): the XS signature, initialisation, most branch bodies and
 * the statement guarded by the final isMAYBE test are not visible in this excerpt. */
639 STRLEN srclen, retlen;
640 U8 *s, *e, *p, curCC, preCC;
642 bool isMAYBE = FALSE;
644 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
648 for (p = s; p < e; p += retlen) {
649 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
651 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
653 curCC = getCombinClass(uv);
654 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
659 /* get NFC/NFKC property */
660 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
662 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
666 else if (isComp2nd(uv))
669 char *canon, *compat;
670 /* NFKC_NO when having compatibility mapping. */
671 canon = (char *) dec_canonical(uv);
672 compat = (char *) dec_compat(uv);
/* True when a compatibility mapping exists that is not identical to the canonical one. */
673 if (compat && !(canon && strEQ(canon, compat))) {
677 } /* end of get NFC/NFKC property */
/* The maybe outcome is considered only while result is still true. */
681 if (isMAYBE && result) /* NO precedes MAYBE */
683 RETVAL = boolSV(result);
/* Quick check based on the canonical decomposition of each character: the
 * leading character of the decomposition supplies curCC and the trailing
 * character supplies preCC for the next round.
 * NOTE(review): the XS signature, initialisation, the paths taken when a
 * character has no canonical mapping, and most branch bodies are not
 * visible in this excerpt. */
695 STRLEN srclen, retlen;
696 U8 *s, *e, *p, curCC, preCC;
698 bool isMAYBE = FALSE;
700 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
703 for (p = s; p < e; p += retlen) {
707 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
709 croak(ErrRetlenIsZero, "checkFCD or -FCC");
711 sCan = (U8*) dec_canonical(uv);
/* Decode the first character of the canonical mapping. */
715 canlen = (STRLEN)strlen((char *) sCan);
716 uvLead = utf8_to_uvchr_buf(sCan, sCan + canlen, &canret);
718 croak(ErrRetlenIsZero, "checkFCD or -FCC");
724 curCC = getCombinClass(uvLead);
726 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
732 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
736 else if (isComp2nd(uv))
/* Step back one character from the end of the mapping to decode its last character. */
743 U8* eCan = sCan + canlen;
744 U8* pCan = utf8_hop(eCan, -1);
746 croak(ErrHopBeforeStart);
747 uvTrail = utf8_to_uvchr_buf(pCan, eCan, &canret);
749 croak(ErrRetlenIsZero, "checkFCD or -FCC");
750 preCC = getCombinClass(uvTrail);
/* The maybe outcome is considered only while result is still true. */
756 if (isMAYBE && result) /* NO precedes MAYBE */
758 RETVAL = boolSV(result);
/* Property test for one code point: true when uv is a Hangul syllable or has
 * a mapping in the selected table (compatibility when ix is non-zero,
 * canonical otherwise).
 * NOTE(review): the XS signature and the initialisation of result are not visible in this excerpt. */
802 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
803 result = TRUE; /* NFD_NO or NFKD_NO */
804 RETVAL = boolSV(result);
/* Property test for one code point: true for exclusions, singletons and
 * non-starter decompositions, and (on the second path) when a compatibility
 * mapping exists that differs from the canonical one.
 * NOTE(review): the XS signature, the initialisation of result and the
 * condition guarding the second path are not visible in this excerpt. */
819 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
820 result = TRUE; /* NFC_NO or NFKC_NO */
822 char *canon, *compat;
823 canon = (char *) dec_canonical(uv);
824 compat = (char *) dec_compat(uv);
/* A compatibility mapping with no canonical mapping, or with a different one. */
825 if (compat && (!canon || strNE(canon, compat)))
826 result = TRUE; /* NFC_NO or NFKC_NO */
828 RETVAL = boolSV(result);
833 getComposite(uv, uv2)
/* Return the composite of the pair as an unsigned integer SV, or undef when composite_uv() yields 0. */
840 composite = composite_uv(uv, uv2);
841 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
/* Return the decomposition mapping of a single code point as a string SV.
 * NOTE(review): the XS signature, the declaration of t, the handling of a
 * missing mapping and closing lines are not visible in this excerpt. */
854 if (Hangul_IsS(uv)) {
/* Scratch space for up to three UTF-8 encoded characters plus one spare byte. */
855 U8 tmp[3 * UTF8_MAXLEN + 1];
857 U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
858 RETVAL = newSVpvn((char *)t, e - t);
/* Non-Hangul: table lookup, compatibility when ix is non-zero, canonical otherwise. */
860 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
863 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
871 splitOnLastStarter(src)
/* Split src at its last starter (a character of combining class 0): one
 * mortal SV is built from the bytes in front of p and one from p to the end.
 * NOTE(review): declarations, the backward scan loop that moves p, and the
 * code that hands the two SVs back are not visible in this excerpt. */
878 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
885 croak(ErrHopBeforeStart);
/* The byte length of the character is not needed here, hence the NULL retlen. */
886 uv = utf8_to_uvchr_buf(p, e, NULL);
887 if (getCombinClass(uv) == 0) /* Last Starter found */
891 svp = sv_2mortal(newSVpvn((char*)s, p - s));
895 svp = sv_2mortal(newSVpvn((char*)p, e - p));