/* PERL_NO_GET_CONTEXT: the helpers below take the interpreter context explicitly via pTHX_ and are called with aTHX_. */
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
4 /* private functions which need pTHX_ and aTHX_
16 /* These 5 files are prepared by mkheader */
/* Map the uvuni_to_utf8 name onto uv_to_utf8. */
/* NOTE(review): the conditional that the #endif below closes is not visible in this listing. */
25 #define uvuni_to_utf8 uv_to_utf8
26 #endif /* uvuni_to_utf8 */
/* Likewise fall back to utf8_to_uv when utf8n_to_uvuni is not already defined. */
29 #ifndef utf8n_to_uvuni
30 #define utf8n_to_uvuni utf8_to_uv
31 #endif /* utf8n_to_uvuni */
/* Any UTF8_ALLOW_* flag that is not already defined becomes 0, so it adds nothing to AllowAnyUTF. */
33 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
34 #ifndef UTF8_ALLOW_BOM
35 #define UTF8_ALLOW_BOM (0)
36 #endif /* UTF8_ALLOW_BOM */
38 #ifndef UTF8_ALLOW_SURROGATE
39 #define UTF8_ALLOW_SURROGATE (0)
40 #endif /* UTF8_ALLOW_SURROGATE */
42 #ifndef UTF8_ALLOW_FE_FF
43 #define UTF8_ALLOW_FE_FF (0)
44 #endif /* UTF8_ALLOW_FE_FF */
46 #ifndef UTF8_ALLOW_FFFF
47 #define UTF8_ALLOW_FFFF (0)
48 #endif /* UTF8_ALLOW_FFFF */
/* Combined flag set; it is the flags argument of every utf8n_to_uvuni() call visible in this file. */
50 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
/*
 * Renew_d_if_not_enough_to(need): make sure `need` more bytes fit at d.
 * The expansion starts with a declaration of curlen (the offset of d from
 * dstart) followed by an if statement, so every use needs a scope in which
 * curlen is not declared yet. When dlen < curlen + need, dstart is passed
 * to Renew() with a size of dlen+1 U8 elements and d is re-pointed at the
 * same offset inside the (possibly moved) buffer.
 * NOTE(review): no statement enlarging dlen is visible between the test and
 * the Renew() in this listing -- confirm that dlen is grown by `need` first.
 */
52 /* check if the string buffer is enough before uvuni_to_utf8(). */
53 /* dstart, d, and dlen should be defined outside before. */
54 #define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \
55 if (dlen < curlen + (need)) { \
57 Renew(dstart, dlen+1, U8); \
58 d = dstart + curlen; \
61 /* if utf8n_to_uvuni() sets retlen to 0 (if broken?) */
/* The %s is supplied by each croak() call and names the operation that failed. */
62 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
64 /* utf8_hop() hops back before start. Maybe broken UTF-8 */
65 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
67 /* At present, char > 0x10ffff are unaffected without complaint, right? */
/* OVER_UTF_MAX(uv) is true for every code point above 0x10ffff; the table lookups test it before indexing. */
68 #define VALID_UTF_MAX (0x10ffff)
69 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
71 /* size of array for combining characters */
72 /* enough as an initial value? */
/* CC_SEQ_SIZE is the length of the fixed seq_ary arrays; CC_SEQ_STEP is the slack added past cc_pos when a sequence buffer is enlarged. */
73 #define CC_SEQ_SIZE (10)
74 #define CC_SEQ_STEP (5)
/*
 * Constants for Hangul syllable arithmetic. They are mutually consistent:
 *   NCount = VCount * TCount  (21 * 28 = 588)
 *   SCount = LCount * NCount  (19 * 588 = 11172)
 *   SFinal = SBase + SCount - 1
 */
77 #define Hangul_SBase 0xAC00
78 #define Hangul_SFinal 0xD7A3
79 #define Hangul_SCount 11172
81 #define Hangul_NCount 588
83 #define Hangul_LBase 0x1100
84 #define Hangul_LFinal 0x1112
85 #define Hangul_LCount 19
87 #define Hangul_VBase 0x1161
88 #define Hangul_VFinal 0x1175
89 #define Hangul_VCount 21
91 #define Hangul_TBase 0x11A7
92 #define Hangul_TFinal 0x11C2
93 #define Hangul_TCount 28
/*
 * Range predicates. Each one evaluates its argument more than once.
 * Hangul_IsN() looks only at the offset from SBase, so on its own it says
 * nothing about the range; Hangul_IsLV() pairs it with Hangul_IsS().
 * Hangul_IsT() has a strict lower bound: TBase itself is not accepted
 * (presumably because T index 0 stands for "no trailing consonant").
 */
95 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
96 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
97 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
98 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
99 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
100 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
103 /* this is used for canonical ordering of combining characters (c.c.). */
/* One record per buffered character. pos is set to the index at which the record was stored and serves as the tie-breaker in compare_cc(). */
105 U8 cc; /* combining class */
106 UV uv; /* codepoint */
107 STRLEN pos; /* position */
/*
 * Comparison callback handed to qsort() for arrays of UNF_cc.
 * a, b: pointers to the two UNF_cc records to compare.
 */
110 static int compare_cc(const void *a, const void *b)
/* first key: difference of the two combining classes */
113 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
/* second key: pos, reduced to -1, 0 or 1 by subtracting the two comparison results */
117 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
118 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
/*
 * Look uv up in the UNF_canon table.
 * The table is walked in three steps: uv >> 16 selects the plane, bits 8..15
 * the row, and the low 8 bits the cell.
 * Returns the U8* stored in the cell, or NULL when the row pointer is NULL.
 * Callers measure a non-NULL result with strlen().
 */
121 static U8* dec_canonical(UV uv)
/* values above VALID_UTF_MAX are dealt with before the table is indexed */
124 if (OVER_UTF_MAX(uv))
126 plane = (U8***)UNF_canon[uv >> 16];
129 row = plane[(uv >> 8) & 0xff];
130 return row ? row[uv & 0xff] : NULL;
/*
 * Look uv up in the UNF_compat table; same three-step indexing as
 * dec_canonical() (uv >> 16, then bits 8..15, then the low 8 bits).
 * Returns the U8* stored in the cell, or NULL when the row pointer is NULL.
 */
133 static U8* dec_compat(UV uv)
/* values above VALID_UTF_MAX are dealt with before the table is indexed */
136 if (OVER_UTF_MAX(uv))
138 plane = (U8***)UNF_compat[uv >> 16];
141 row = plane[(uv >> 8) & 0xff];
142 return row ? row[uv & 0xff] : NULL;
/*
 * Try to combine the pair (uv, uv2) into one code point.
 * Hangul pairs (L followed by V, or LV followed by T) are handled with index
 * arithmetic; every other pair is searched for in the UNF_compos table,
 * which is indexed by uv in three steps and whose cells are arrays of
 * UNF_complist terminated by an entry with nextchar == 0.
 * Callers treat a zero result as "no composite".
 */
145 static UV composite_uv(UV uv, UV uv2)
147 UNF_complist ***plane, **row, *cell, *i;
/* a zero uv2, or either value above VALID_UTF_MAX, is filtered out first */
149 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
/* Hangul L + V */
152 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
153 UV lindex = uv - Hangul_LBase;
154 UV vindex = uv2 - Hangul_VBase;
155 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
/* Hangul LV + T */
158 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
159 UV tindex = uv2 - Hangul_TBase;
/* table lookup keyed on the first character */
162 plane = UNF_compos[uv >> 16];
165 row = plane[(uv >> 8) & 0xff];
168 cell = row[uv & 0xff];
/* scan the list for an entry whose nextchar equals uv2 */
171 for (i = cell; i->nextchar; i++) {
172 if (uv2 == i->nextchar)
/*
 * Return the combining class recorded for uv in the UNF_combin table.
 * The table has two pointer levels (plane, then row); the low 8 bits of uv
 * index the row. A NULL row yields 0.
 */
178 static U8 getCombinClass(UV uv)
/* values above VALID_UTF_MAX are dealt with before the table is indexed */
181 if (OVER_UTF_MAX(uv))
183 plane = (U8**)UNF_combin[uv >> 16];
186 row = plane[(uv >> 8) & 0xff];
187 return row ? row[uv & 0xff] : 0;
/*
 * Write the jamo decomposition of the Hangul syllable uv at d.
 * d is advanced by assigning each uvuni_to_utf8() result back to it.
 * The visible callers provide room for three characters of UTF8_MAXLEN bytes.
 * NOTE(review): the indexes are computed in the initializers, i.e. before
 * the Hangul_IsS(uv) test; the action taken for a non-syllable is not
 * visible in this listing.
 */
190 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
192 UV sindex = uv - Hangul_SBase;
193 UV lindex = sindex / Hangul_NCount;
194 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
195 UV tindex = sindex % Hangul_TCount;
197 if (! Hangul_IsS(uv))
/* leading consonant, then vowel */
200 d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
201 d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
/* trailing consonant -- NOTE(review): any guard on tindex is not visible here */
203 d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
/*
 * Obtain the string of sv for the normalization routines; the XS callers
 * pass the address of their length variable as lp.
 * The visible lines operate on a mortal copy: the bytes (s, len) are copied
 * into tmpsv, which is forced to a string and then upgraded with
 * sv_utf8_upgrade(), so these lines do not upgrade sv itself.
 * NOTE(review): the condition under which the copy is made is not visible
 * here -- presumably when sv is not already flagged as UTF-8.
 */
207 static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
213 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
215 s = SvPV_force(tmpsv,len);
216 sv_utf8_upgrade(tmpsv);
/*
 * Decompose the UTF-8 string s (slen bytes) into the buffer passed via dp.
 *   dp, dlen: output buffer and its size; the buffer may be reallocated by
 *             Renew_d_if_not_enough_to().
 *   iscompat: selects dec_compat() instead of dec_canonical().
 * The XS callers use the return value as the end of the written data and
 * read the buffer start back from the variable dp points to.
 * Croaks with ErrRetlenIsZero when a character decodes with zero length.
 */
225 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
234 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
236 croak(ErrRetlenIsZero, "decompose");
/* Hangul syllable: up to three jamo are written, so room for three characters is reserved */
239 if (Hangul_IsS(uv)) {
240 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
241 d = pv_cat_decompHangul(aTHX_ d, uv);
/* otherwise consult the decomposition table chosen by iscompat */
244 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
/* the table entry is measured with strlen() and that many bytes are reserved */
247 STRLEN len = (STRLEN)strlen((char *)r);
248 Renew_d_if_not_enough_to(len)
/* re-encode uv itself */
253 Renew_d_if_not_enough_to(UTF8_MAXLEN)
254 d = uvuni_to_utf8(d, uv);
/*
 * Copy the UTF-8 string s (slen bytes) to the buffer passed via dp, sorting
 * buffered runs of characters by combining class.
 *   dp, dlen: output buffer and its size; the buffer may be reallocated.
 * Buffered characters are kept as UNF_cc records, first in the fixed array
 * seq_ary and, once that is full, in the heap block seq_ext.
 * The XS callers use the return value as the end of the written data.
 * Croaks with ErrRetlenIsZero when a character decodes with zero length.
 */
263 U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
270 UNF_cc seq_ary[CC_SEQ_SIZE];
271 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
272 UNF_cc* seq_ext = NULL; /* extend if need */
273 STRLEN seq_max = CC_SEQ_SIZE;
279 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
281 croak(ErrRetlenIsZero, "reorder");
284 curCC = getCombinClass(uv);
/* grow the record buffer: the first growth allocates seq_ext and copies seq_ary into it, later ones Renew() seq_ext */
287 if (seq_max < cc_pos + 1) { /* extend if need */
288 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
289 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
291 New(0, seq_ext, seq_max, UNF_cc);
292 for (i = 0; i < cc_pos; i++)
293 seq_ext[i] = seq_ary[i];
296 Renew(seq_ext, seq_max, UNF_cc);
298 seq_ptr = seq_ext; /* use seq_ext from now */
/* store the record; pos keeps the arrival index for compare_cc() */
301 seq_ptr[cc_pos].cc = curCC;
302 seq_ptr[cc_pos].uv = uv;
303 seq_ptr[cc_pos].pos = cc_pos;
314 if (cc_pos > 1) /* reordered if there are two c.c.'s */
315 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
/* write the records out in their (possibly sorted) order */
317 for (i = 0; i < cc_pos; i++) {
318 Renew_d_if_not_enough_to(UTF8_MAXLEN)
319 d = uvuni_to_utf8(d, seq_ptr[i].uv);
/* write the current character itself -- NOTE(review): the governing condition is not visible here */
325 Renew_d_if_not_enough_to(UTF8_MAXLEN)
326 d = uvuni_to_utf8(d, uv);
/*
 * Compose the UTF-8 string s (slen bytes) into the buffer passed via dp.
 *   dp, dlen: output buffer and its size; the buffer may be reallocated.
 *   iscontig: when true, composition is blocked as soon as any character
 *             has been buffered (cc_pos != 0).
 * uvS holds the current starter. A following character is offered to
 * composite_uv(uvS, uv) unless the blocking test rejects it, and a composite
 * is only accepted when isExclusion() is false for it. Buffered characters
 * live in seq_ary / seq_ext and are written after the starter.
 * The XS callers use the return value as the end of the written data.
 * Croaks with ErrRetlenIsZero when a character decodes with zero length.
 */
336 U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
343 UV uvS = 0; /* code point of the starter */
344 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
347 UV seq_ary[CC_SEQ_SIZE];
348 UV* seq_ptr = seq_ary; /* use array at the beginning */
349 UV* seq_ext = NULL; /* extend if need */
350 STRLEN seq_max = CC_SEQ_SIZE;
356 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
358 croak(ErrRetlenIsZero, "compose");
361 curCC = getCombinClass(uv);
365 uvS = uv; /* the first Starter is found */
371 Renew_d_if_not_enough_to(UTF8_MAXLEN)
372 d = uvuni_to_utf8(d, uv);
380 if ((iscontig && cc_pos) || /* discontiguous combination */
381 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
382 (preCC > curCC)) /* blocked by higher CC: revised D2 */
386 iscontig && cc_pos == 0 -- contiguous combination
387 curCC == 0 && preCC == 0 -- starter + starter
388 curCC != 0 && preCC < curCC -- lower CC */
390 /* try composition */
391 UV uvComp = composite_uv(uvS, uv);
393 if (uvComp && !isExclusion(uvComp)) {
397 /* preCC should not be changed to curCC */
398 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
/* buffer uv when it is a non-starter or when the input is exhausted */
408 if (curCC != 0 || !(p < e)) {
/* same growth scheme as in pv_utf8_reorder(), with Copy() for the first move to the heap */
409 if (seq_max < cc_pos + 1) { /* extend if need */
410 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
411 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
412 New(0, seq_ext, seq_max, UV);
413 Copy(seq_ary, seq_ext, cc_pos, UV);
416 Renew(seq_ext, seq_max, UV);
418 seq_ptr = seq_ext; /* use seq_ext from now */
420 seq_ptr[cc_pos] = uv;
423 if (curCC != 0 && p < e)
/* write the starter, then the buffered characters in the order they were stored */
430 Renew_d_if_not_enough_to(UTF8_MAXLEN)
431 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
437 for (i = 0; i < cc_pos; i++) {
438 Renew_d_if_not_enough_to(UTF8_MAXLEN)
439 d = uvuni_to_utf8(d, seq_ptr[i]);
/* End of the plain C helpers: the MODULE line below starts the XS section for package Unicode::Normalize. */
452 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
455 decompose(src, compat = &PL_sv_no)
/*
 * src is fetched as UTF-8, decomposed into the freshly allocated buffer d,
 * and the bytes between d and dend are copied into the new SV dst.
 * compat defaults to &PL_sv_no; its truth value is passed on as iscompat.
 */
464 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
465 dst = newSVpvn("", 0);
467 New(0, d, dlen+1, U8);
/* d is passed by address because the callee may reallocate the buffer */
468 dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
469 sv_setpvn(dst, (char *)d, dend - d);
/*
 * Same shape as the decompose body: src is fetched as UTF-8, run through
 * pv_utf8_reorder() into the freshly allocated buffer d, and the bytes
 * between d and dend are copied into the new SV dst.
 */
486 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
487 dst = newSVpvn("", 0);
489 New(0, d, dlen+1, U8);
490 dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
491 sv_setpvn(dst, (char *)d, dend - d);
504 composeContiguous = 1
/*
 * src is fetched as UTF-8, run through pv_utf8_compose() into the freshly
 * allocated buffer d, and the bytes between d and dend are copied into dst.
 * ix is passed on as iscontig, so the composeContiguous alias (ix == 1)
 * requests contiguous composition.
 */
510 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
511 dst = newSVpvn("", 0);
513 New(0, d, dlen+1, U8);
514 dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
515 sv_setpvn(dst, (char *)d, dend - d);
/*
 * Two-stage pipeline: decompose src into t, then reorder t into d, and copy
 * the result into the new SV dst. ix == 1 selects compatibility
 * decomposition.
 */
531 U8 *s, *t, *tend, *d, *dend;
532 STRLEN slen, tlen, dlen;
534 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
/* stage 1: decomposition into t */
538 New(0, t, tlen+1, U8);
539 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
/* from here on tlen is the number of bytes written, not the allocated size */
541 tlen = tend - t; /* no longer know real size of t */
/* stage 2: reordering into d */
545 New(0, d, dlen+1, U8);
546 dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
548 dlen = dend - d; /* no longer know real size of d */
551 dst = newSVpvn("", 0);
552 sv_setpvn(dst, (char *)d, dlen);
/*
 * Three-stage pipeline: decompose src into t, reorder t into u, compose u
 * into d, and copy the result into the new SV dst.
 * ix == 1 selects compatibility decomposition; ix == 2 selects contiguous
 * composition.
 */
571 U8 *s, *t, *tend, *u, *uend, *d, *dend;
572 STRLEN slen, tlen, ulen, dlen;
574 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
/* stage 1: decomposition into t */
578 New(0, t, tlen+1, U8);
579 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
581 tlen = tend - t; /* no longer know real size of t */
/* stage 2: reordering into u */
585 New(0, u, ulen+1, U8);
586 uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
588 ulen = uend - u; /* no longer know real size of u */
/* stage 3: composition into d */
592 New(0, d, dlen+1, U8);
593 dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
595 dlen = dend - d; /* no longer know real size of d */
598 dst = newSVpvn("", 0);
599 sv_setpvn(dst, (char *)d, dlen);
/*
 * Quick check over src, one character at a time; the outcome is returned as
 * a Perl boolean built from result. ix selects the compatibility table.
 */
617 STRLEN srclen, retlen;
618 U8 *s, *e, *p, curCC, preCC;
621 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
/* p advances by retlen, the length reported by the decoder for the current character */
625 for (p = s; p < e; p += retlen) {
626 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
628 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
630 curCC = getCombinClass(uv);
631 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
/* a Hangul syllable, or any character with an entry in the selected decomposition table */
635 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
641 RETVAL = boolSV(result);
/*
 * Quick check over src, one character at a time, with a separate isMAYBE
 * flag next to result. The value handed back through boolSV() is result;
 * what happens when isMAYBE is set and result is true is not visible here.
 */
653 STRLEN srclen, retlen;
654 U8 *s, *e, *p, curCC, preCC;
656 bool isMAYBE = FALSE;
658 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
662 for (p = s; p < e; p += retlen) {
663 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
665 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
667 curCC = getCombinClass(uv);
668 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
673 /* get NFC/NFKC property */
674 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
676 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
680 else if (isComp2nd(uv))
683 char *canon, *compat;
684 /* NFKC_NO when having compatibility mapping. */
685 canon = (char *) dec_canonical(uv);
686 compat = (char *) dec_compat(uv);
/* true when a compatibility entry exists and is not identical to the canonical entry */
687 if (compat && !(canon && strEQ(canon, compat))) {
691 } /* end of get NFC/NFKC property */
695 if (isMAYBE && result) /* NO precedes MAYBE */
697 RETVAL = boolSV(result);
/*
 * Quick check over src that looks at the canonical decomposition of each
 * character: the combining class of the first decomposed character is
 * compared with preCC, and the class of the last decomposed character
 * becomes preCC for the next round.
 */
709 STRLEN srclen, retlen;
710 U8 *s, *e, *p, curCC, preCC;
712 bool isMAYBE = FALSE;
714 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
717 for (p = s; p < e; p += retlen) {
721 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
723 croak(ErrRetlenIsZero, "checkFCD or -FCC");
725 sCan = (U8*) dec_canonical(uv);
/* decode the leading character of the decomposition string */
729 canlen = (STRLEN)strlen((char *) sCan);
730 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
732 croak(ErrRetlenIsZero, "checkFCD or -FCC");
738 curCC = getCombinClass(uvLead);
740 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
746 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
750 else if (isComp2nd(uv))
/* step back one character from the end of the decomposition string to decode its trailing character */
757 U8* eCan = sCan + canlen;
758 U8* pCan = utf8_hop(eCan, -1);
760 croak(ErrHopBeforeStart);
761 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
763 croak(ErrRetlenIsZero, "checkFCD or -FCC");
764 preCC = getCombinClass(uvTrail);
770 if (isMAYBE && result) /* NO precedes MAYBE */
772 RETVAL = boolSV(result);
/* Single-character test: true for a Hangul syllable or for a character with an entry in the table selected by ix (compatibility when ix is non-zero). */
816 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
817 result = TRUE; /* NFD_NO or NFKD_NO */
818 RETVAL = boolSV(result);
/* Single-character test: true for exclusions, singletons and non-starter decompositions. */
833 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
834 result = TRUE; /* NFC_NO or NFKC_NO */
836 char *canon, *compat;
837 canon = (char *) dec_canonical(uv);
838 compat = (char *) dec_compat(uv);
/* also true when a compatibility entry exists and differs from (or lacks) a canonical entry */
839 if (compat && (!canon || strNE(canon, compat)))
840 result = TRUE; /* NFC_NO or NFKC_NO */
842 RETVAL = boolSV(result);
847 getComposite(uv, uv2)
/* a zero result from composite_uv() is reported as undef, anything else as an unsigned integer SV */
854 composite = composite_uv(uv, uv2);
855 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
/* Single-character decomposition returned as a new string SV. */
868 if (Hangul_IsS(uv)) {
/* scratch space for three characters of UTF8_MAXLEN bytes plus one byte */
869 U8 tmp[3 * UTF8_MAXLEN + 1];
871 U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
872 RETVAL = newSVpvn((char *)t, e - t);
/* otherwise take the entry from the table selected by ix (compatibility when ix is non-zero) */
874 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
877 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
885 splitOnLastStarter(src)
/*
 * Finds a position p in src whose character has combining class 0 and
 * builds two mortal SVs: the bytes from s up to p, and the bytes from p up
 * to e.
 */
892 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
899 croak(ErrHopBeforeStart);
/* the decoded length is not needed here, so NULL is passed for it */
900 uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
901 if (getCombinClass(uv) == 0) /* Last Starter found */
905 svp = sv_2mortal(newSVpvn((char*)s, p - s));
909 svp = sv_2mortal(newSVpvn((char*)p, e - p));