/* Perl 5.6.1 ? */
#ifndef uvuni_to_utf8
#define uvuni_to_utf8 uv_to_utf8
-#endif /* uvuni_to_utf8 */
+#endif /* uvuni_to_utf8 */
/* Perl 5.6.1 ? */
-#ifndef utf8n_to_uvchr
-#define utf8n_to_uvchr utf8_to_uv
-#endif /* utf8n_to_uvchr */
+#ifndef utf8n_to_uvuni
+#define utf8n_to_uvuni utf8_to_uv /* fallback alias, used only when utf8n_to_uvuni is not already defined */
+#endif /* utf8n_to_uvuni */
/* At present, char > 0x10ffff are unaffected without complaint, right? */
#define VALID_UTF_MAX (0x10ffff)
#define Hangul_TCount 28
#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
-#define Hangul_IsN(u) (! (((u) - Hangul_SBase) % Hangul_TCount))
+#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) /* true when the offset from Hangul_SBase is a multiple of Hangul_TCount; same test as before, spelled as an explicit comparison */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
STRLEN pos; /* position */
} UNF_cc;
-int compare_cc(const void *a, const void *b)
+int compare_cc (const void *a, const void *b) /* comparator handed to qsort(): orders UNF_cc entries by cc first, then by pos */
{
int ret_cc;
- ret_cc = (*(UNF_cc*)a).cc - (*(UNF_cc*)b).cc;
- if(ret_cc) return ret_cc;
- return (*(UNF_cc*)a).pos - (*(UNF_cc*)b).pos;
+ ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; /* primary key: difference of the cc fields */
+ if (ret_cc) /* cc differs: its sign decides the order */
+ return ret_cc;
+
+ return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) /* equal cc: tie-break on pos, yielding 1, 0 or -1 from two comparisons instead of subtracting the STRLEN values and narrowing the result to int */
+ - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
}
U8* dec_canonical (UV uv)
{
U8 ***plane, **row;
- if(OVER_UTF_MAX(uv)) return NULL;
+ if (OVER_UTF_MAX(uv)) /* dec_canonical: three-level lookup in UNF_canon (uv >> 16, then (uv >> 8) & 0xff, then uv & 0xff); NULL when uv fails this range check or any level is missing */
+ return NULL;
plane = (U8***)UNF_canon[uv >> 16];
- if(! plane) return NULL;
+ if (! plane) /* UNF_canon has no sub-table at index uv >> 16 */
+ return NULL;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : NULL;
}
U8* dec_compat (UV uv)
{
U8 ***plane, **row;
- if(OVER_UTF_MAX(uv)) return NULL;
+ if (OVER_UTF_MAX(uv)) /* dec_compat: three-level lookup in UNF_compat (uv >> 16, then (uv >> 8) & 0xff, then uv & 0xff); NULL when uv fails this range check or any level is missing */
+ return NULL;
plane = (U8***)UNF_compat[uv >> 16];
- if(! plane) return NULL;
+ if (! plane) /* UNF_compat has no sub-table at index uv >> 16 */
+ return NULL;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : NULL;
}
-UV getComposite (UV uv, UV uv2)
+UV composite_uv (UV uv, UV uv2) /* returns the composite of the pair (uv, uv2), or 0 when there is none; renamed from getComposite by this change */
{
UNF_complist ***plane, **row, *cell, *i;
- if(! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) return 0;
+ if (! uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) /* a zero uv2 or an out-of-range operand never composes */
+ return 0;
- if(Hangul_IsL(uv) && Hangul_IsV(uv2)) {
+ if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { /* Hangul L + V: the result is computed arithmetically, no table lookup */
uv -= Hangul_LBase; /* lindex */
uv2 -= Hangul_VBase; /* vindex */
return(Hangul_SBase + (uv * Hangul_VCount + uv2) * Hangul_TCount);
}
- if(Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
+ if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { /* Hangul LV + T: add the T index to the LV syllable */
uv2 -= Hangul_TBase; /* tindex */
- return (uv + uv2);
+ return(uv + uv2);
}
plane = UNF_compos[uv >> 16];
- if(! plane) return 0;
+ if (! plane) /* UNF_compos has no sub-table at index uv >> 16 */
+ return 0;
row = plane[(uv >> 8) & 0xff];
- if(! row) return 0;
+ if (! row) /* no row at index (uv >> 8) & 0xff */
+ return 0;
cell = row[uv & 0xff];
- if(! cell) return 0;
- for(i = cell; i->nextchar; i++) {
- if(uv2 == i->nextchar) return i->composite;
+ if (! cell) /* uv has no composition list */
+ return 0;
+ for (i = cell; i->nextchar; i++) { /* the list ends at the entry whose nextchar is 0 */
+ if (uv2 == i->nextchar) /* entry for this second character found */
+ return i->composite;
}
return 0;
}
U8 getCombinClass (UV uv)
{
U8 **plane, *row;
- if(OVER_UTF_MAX(uv)) return 0;
+ if (OVER_UTF_MAX(uv)) /* getCombinClass: lookup in UNF_combin (uv >> 16, then (uv >> 8) & 0xff, then uv & 0xff); 0 when uv fails this range check or a level is missing */
+ return 0;
plane = (U8**)UNF_combin[uv >> 16];
- if(! plane) return 0;
+ if (! plane) /* UNF_combin has no sub-table at index uv >> 16 */
+ return 0;
row = plane[(uv >> 8) & 0xff];
return row ? row[uv & 0xff] : 0;
}
void sv_cat_decompHangul (SV* sv, UV uv)
{
UV sindex, lindex, vindex, tindex;
- U8 *t, temp[3 * UTF8_MAXLEN + 1];
+ U8 *t, tmp[3 * UTF8_MAXLEN + 1]; /* holds up to three uvuni_to_utf8() outputs plus the '\0' written below */
- if(! Hangul_IsS(uv)) return;
+ if (! Hangul_IsS(uv)) /* nothing is appended unless uv lies in Hangul_SBase..Hangul_SFinal */
+ return;
sindex = uv - Hangul_SBase;
lindex = sindex / Hangul_NCount;
vindex = (sindex % Hangul_NCount) / Hangul_TCount;
tindex = sindex % Hangul_TCount;
- t = temp;
+ t = tmp; /* t is the write cursor into tmp */
t = uvuni_to_utf8(t, (lindex + Hangul_LBase));
t = uvuni_to_utf8(t, (vindex + Hangul_VBase));
- if (tindex) t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
+ if (tindex) /* tindex == 0: no third code point is emitted */
+ t = uvuni_to_utf8(t, (tindex + Hangul_TBase));
*t = '\0';
- sv_catpvn(sv, (char *)temp, strlen((char *)temp));
+ sv_catpvn(sv, (char *)tmp, strlen((char *)tmp)); /* append the NUL-terminated bytes built above to sv */
}
MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
-
SV*
-decompose(arg, compat)
+decompose(arg, compat = &PL_sv_no)
SV * arg
SV * compat
- PROTOTYPE: $
+ PROTOTYPE: $;$
PREINIT:
- SV *src, *dst;
- STRLEN srclen, dstlen, retlen;
- U8 *s, *e, *p, *d, *r;
UV uv;
+ SV *src, *dst;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, *r;
bool iscompat;
CODE:
- if(SvUTF8(arg)) {
+ if (SvUTF8(arg)) { /* decompose: compat is now optional (defaults to &PL_sv_no) and, when true, selects dec_compat over dec_canonical below; arg is read directly when it carries the UTF-8 flag, otherwise a mortal copy is upgraded */
src = arg;
} else {
src = sv_mortalcopy(arg);
sv_utf8_upgrade(src);
}
-
iscompat = SvTRUE(compat);
dst = newSV(1);
s = (U8*)SvPV(src,srclen);
e = s + srclen;
- for(p = s; p < e;){
- uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ for (p = s; p < e;) { /* p is advanced inside the body by retlen */
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0); /* decode one code point; retlen is then used as the step for p */
p += retlen;
- if(Hangul_IsS(uv)) sv_cat_decompHangul(dst, uv);
+ if (Hangul_IsS(uv)) /* Hangul syllables are expanded by sv_cat_decompHangul instead of a table lookup */
+ sv_cat_decompHangul(dst, uv);
else {
r = iscompat ? dec_compat(uv) : dec_canonical(uv);
- if(r) sv_catpv(dst, (char *)r);
- else sv_catpvn(dst, (char *)p - retlen, retlen);
+ if (r) /* a mapping was found: append it */
+ sv_catpv(dst, (char *)r);
+ else /* no mapping: copy the retlen source bytes of this code point unchanged */
+ sv_catpvn(dst, (char *)p - retlen, retlen);
}
}
RETVAL = dst;
SV * arg
PROTOTYPE: $
PREINIT:
- SV *src;
- STRLEN srclen, retlen, stk_cc_max;
- U8 *s, *e, *p, curCC;
+ SV *src, *dst;
+ STRLEN srclen, dstlen, retlen, stk_cc_max;
+ U8 *s, *e, *p, *d, curCC;
UV uv;
UNF_cc * stk_cc;
CODE:
- src = newSVsv(arg);
- if(! SvUTF8(arg)) sv_utf8_upgrade(src);
+ if (SvUTF8(arg)) {
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src, srclen);
+
+ dstlen = srclen + 1;
+ dst = newSV(dstlen);
+ sv_setpvn(dst,(const char*)s,srclen);
+ SvUTF8_on(dst);
stk_cc_max = 10; /* enough as an initial value? */
New(0, stk_cc, stk_cc_max, UNF_cc);
- s = (U8*)SvPV(src,srclen);
- e = s + srclen;
- for(p = s; p < e;){
+ d = (U8*)SvPV(dst,dstlen);
+ e = d + dstlen;
+
+ for (p = d; p < e;) {
U8 *cc_in;
STRLEN cc_len, cc_iter, cc_pos;
- uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
- p += retlen;
- cc_pos = 0;
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
curCC = getCombinClass(uv);
- if(! (curCC && p < e)) continue; else cc_in = p - retlen;
+ p += retlen;
+ if (! (curCC && p < e))
+ continue;
+ else
+ cc_in = p - retlen;
+
+ cc_pos = 0;
stk_cc[cc_pos].cc = curCC;
stk_cc[cc_pos].uv = uv;
stk_cc[cc_pos].pos = cc_pos;
- while(p < e) {
- uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ while (p < e) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
curCC = getCombinClass(uv);
- if(!curCC) break;
+ if (!curCC)
+ break;
p += retlen;
cc_pos++;
- if(stk_cc_max <= cc_pos) { /* extend if need */
+ if (stk_cc_max <= cc_pos) { /* extend if need */
stk_cc_max = cc_pos + 1;
Renew(stk_cc, stk_cc_max, UNF_cc);
}
}
/* only one c.c. in cc_len from cc_in, no need of reordering */
- if(!cc_pos) continue;
+ if (!cc_pos)
+ continue;
qsort((void*)stk_cc, cc_pos + 1, sizeof(UNF_cc), compare_cc);
cc_len = p - cc_in;
p = cc_in;
- for(cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
+ for (cc_iter = 0; cc_iter <= cc_pos; cc_iter++) {
p = uvuni_to_utf8(p, stk_cc[cc_iter].uv);
}
}
Safefree(stk_cc);
- RETVAL = src;
+ RETVAL = dst;
OUTPUT:
RETVAL
-void
+SV*
compose(arg)
SV * arg
PROTOTYPE: $
SV *src, *dst, *tmp;
U8 *s, *p, *e, *d, *t, *tmp_start, curCC, preCC;
UV uv, uvS, uvComp;
- STRLEN srclen, dstlen, tmplen, dstcur, retlen;
+ STRLEN srclen, dstlen, tmplen, retlen;
bool beginning = TRUE;
- PPCODE:
- if(SvUTF8(arg)) {
+ CODE:
+ if (SvUTF8(arg)) { /* read arg directly when it carries the UTF-8 flag, otherwise upgrade a mortal copy */
src = arg;
} else {
src = sv_mortalcopy(arg);
sv_utf8_upgrade(src);
}
+
s = (U8*)SvPV(src, srclen);
e = s + srclen;
- dstlen = srclen + 1; /* equal or shorter, XXX */
- dst = sv_2mortal(newSV(dstlen));
+ dstlen = srclen + 1; /* initial size only; enlarged below when a composite encodes longer than the two parts it replaces */
+ dst = newSV(dstlen); /* no sv_2mortal here any more: dst is handed back through RETVAL */
(void)SvPOK_only(dst);
SvUTF8_on(dst);
d = (U8*)SvPVX(dst);
(void)SvPOK_only(tmp);
SvUTF8_on(tmp);
- for(p = s; p < e;){
- if(beginning) {
- uvS = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ for (p = s; p < e;) { /* p is advanced inside the body by retlen */
+ if (beginning) {
+ uvS = utf8n_to_uvuni(p, e - p, &retlen, 0); /* candidate starter; checked for a non-zero combining class just below */
p += retlen;
- if (getCombinClass(uvS)){ /* no Starter found yet */
+ if (getCombinClass(uvS)) { /* no Starter found yet */
d = uvuni_to_utf8(d, uvS);
continue;
}
preCC = 0;
/* to the next Starter */
- while(p < e) {
- uv = utf8n_to_uvchr(p, e - p, &retlen, 0);
+ while (p < e) {
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0); /* next code point following the current starter uvS */
p += retlen;
curCC = getCombinClass(uv);
- if(preCC && preCC == curCC) {
+ if (preCC && preCC == curCC) { /* same non-zero class as the previous one: not tried for composition, written through t instead */
preCC = curCC;
t = uvuni_to_utf8(t, uv);
} else {
- uvComp = getComposite(uvS, uv);
-
- /* S + C + S => S-S + C would be also blocked. */
- if( uvComp && ! getExclusion(uvComp) && preCC <= curCC)
- {
+ uvComp = composite_uv(uvS, uv); /* 0 when the pair has no composite */
+
+ if (uvComp && ! isExclusion(uvComp) && preCC <= curCC) {
+ STRLEN leftcur, rightcur, dstcur;
+ leftcur = UNISKIP(uvComp); /* UNISKIP: encoded byte length of the composite (per the Perl API) */
+ rightcur = UNISKIP(uvS) + UNISKIP(uv); /* encoded byte length of the two parts */
+
+ if (leftcur > rightcur) { /* the composite needs more bytes than its parts: enlarge dst */
+ dstcur = d - (U8*)SvPVX(dst); /* remember the write offset before growing */
+ dstlen += leftcur - rightcur;
+ d = (U8*)SvGROW(dst,dstlen) + dstcur; /* re-derive d from the pointer SvGROW returns plus the saved offset */
+ }
/* preCC not changed to curCC */
uvS = uvComp;
- } else if (! curCC && p < e) { /* blocked */
+ } else if (! curCC && p < e) { /* blocked */
break;
} else {
preCC = curCC;
}
}
}
- d = uvuni_to_utf8(d, uvS); /* composed char */
- if(tmplen = t - tmp_start) { /* uncomposed combining char */
+ d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
+ tmplen = t - tmp_start; /* bytes accumulated through t since tmp_start; assignment moved out of the condition */
+ if (tmplen) { /* uncomposed combining char */
t = (U8*)SvPVX(tmp);
- while(tmplen--) *d++ = *t++;
+ while (tmplen--) /* byte-wise copy of the pending bytes after the starter */
+ *d++ = *t++;
}
uvS = uv;
} /* for */
- dstcur = d - (U8*)SvPVX(dst);
- SvCUR_set(dst, dstcur);
- XPUSHs(dst);
+ *d = '\0'; /* terminate the buffer before its length is recorded */
+ SvCUR_set(dst, d - (U8*)SvPVX(dst)); /* current length = bytes written so far */
+ RETVAL = dst; /* returned via RETVAL/OUTPUT instead of XPUSHs */
+ OUTPUT:
+ RETVAL
+
+
+
+void
+checkNFD(arg)
+ SV * arg
+ PROTOTYPE: $
+ ALIAS:
+ checkNFKD = 1
+ PREINIT:
+ UV uv;
+ SV *src;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, curCC, preCC;
+ PPCODE:
+ if (SvUTF8(arg)) { /* checkNFD (ix == 0) / checkNFKD (ix == 1): returns YES or NO after scanning arg, or an upgraded mortal copy when arg lacks the UTF-8 flag */
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+
+ preCC = 0; /* combining class of the previous code point; 0 before the first one */
+ for (p = s; p < e; p += retlen) { /* retlen is set by the decode call in the body */
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+ if (preCC > curCC && curCC != 0) /* canonical ordering violated */
+ XSRETURN_NO;
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) /* uv is a Hangul syllable or has a mapping in the table selected by ix: answer NO */
+ XSRETURN_NO;
+ preCC = curCC;
+ }
+ XSRETURN_YES; /* no code point triggered either NO condition */
+
+
+
+void
+checkNFC(arg)
+ SV * arg
+ PROTOTYPE: $
+ ALIAS:
+ checkNFKC = 1
+ PREINIT:
+ UV uv;
+ SV *src;
+ STRLEN srclen, retlen;
+ U8 *s, *e, *p, curCC, preCC;
+ bool isMAYBE;
+ PPCODE:
+ if (SvUTF8(arg)) { /* checkNFC (ix == 0) / checkNFKC (ix == 1): returns YES, NO, or undef for "maybe"; scans arg, or an upgraded mortal copy when arg lacks the UTF-8 flag */
+ src = arg;
+ } else {
+ src = sv_mortalcopy(arg);
+ sv_utf8_upgrade(src);
+ }
+
+ s = (U8*)SvPV(src,srclen);
+ e = s + srclen;
+
+ preCC = 0; /* combining class of the previous code point; 0 before the first one */
+ isMAYBE = FALSE; /* set once any code point satisfies isComp2nd */
+ for (p = s; p < e; p += retlen) { /* retlen is set by the decode call in the body */
+ uv = utf8n_to_uvuni(p, e - p, &retlen, 0);
+ curCC = getCombinClass(uv);
+
+ if (preCC > curCC && curCC != 0) /* canonical ordering violated */
+ XSRETURN_NO;
+
+ /* get NFC/NFKC property */
+ if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
+ ; /* YES */
+ else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) /* any of these three predicates is a definite NO */
+ XSRETURN_NO;
+ else if (isComp2nd(uv)) /* recorded as MAYBE; the scan continues in case a later code point is a definite NO */
+ isMAYBE = TRUE;
+ else if (ix) { /* checkNFKC only (alias index 1) */
+ char *canon, *compat;
+ /* NFKC_NO when having compatibility mapping. */
+ canon = (char *) dec_canonical(uv);
+ compat = (char *) dec_compat(uv);
+ if (compat && !(canon && strEQ(canon, compat))) /* a compat mapping exists and is not identical to the canonical one */
+ XSRETURN_NO;
+ } /* end of get NFC/NFKC property */
+
+ preCC = curCC;
+ }
+ if (isMAYBE) /* no definite NO was found but at least one MAYBE: answer undef */
+ XSRETURN_UNDEF;
+ else
+ XSRETURN_YES;
U8
getCombinClass(uv)
UV uv
+ PROTOTYPE: $
bool
-getExclusion(uv)
+isExclusion(uv)
UV uv
+ PROTOTYPE: $
-UV
+bool
+isSingleton(uv)
+ UV uv
+ PROTOTYPE: $
+
+bool
+isNonStDecomp(uv)
+ UV uv
+ PROTOTYPE: $
+
+bool
+isComp2nd(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFC_MAYBE = 1
+ isNFKC_MAYBE = 2
+
+
+
+void
+isNFD_NO(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFKD_NO = 1
+ PPCODE:
+ if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) /* isNFD_NO (ix == 0) consults dec_canonical, isNFKD_NO (ix == 1) consults dec_compat; a Hangul syllable answers YES under either name */
+ XSRETURN_YES; /* NFD_NO or NFKD_NO */
+ else
+ XSRETURN_NO;
+
+
+
+void
+isComp_Ex(uv)
+ UV uv
+ PROTOTYPE: $
+ ALIAS:
+ isNFC_NO = 0
+ isNFKC_NO = 1
+ PPCODE:
+ if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) /* any of the three predicates answers YES regardless of ix */
+ XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ else if (ix) { /* alias index 1 (isNFKC_NO): additionally compare the two decomposition tables */
+ char *canon, *compat;
+ canon = (char *) dec_canonical(uv);
+ compat = (char *) dec_compat(uv);
+ if (compat && (!canon || strNE(canon, compat))) /* a compat mapping exists and differs from the canonical one */
+ XSRETURN_YES; /* NFC_NO or NFKC_NO */
+ else
+ XSRETURN_NO;
+ }
+ else /* ix == 0 (isComp_Ex, isNFC_NO) and none of the predicates matched */
+ XSRETURN_NO;
+
+
+
+SV*
getComposite(uv, uv2)
UV uv
UV uv2
+ PROTOTYPE: $$
+ PREINIT:
+ UV composite;
+ CODE:
+ composite = composite_uv(uv, uv2); /* composite_uv returns 0 when the pair has no composite */
+ RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; /* Perl receives undef rather than 0 when there is no composite */
+ OUTPUT:
+ RETVAL
+
+
SV*
getCanon(uv)
PREINIT:
U8 * rstr;
CODE:
- if(Hangul_IsS(uv)) {
+ if (Hangul_IsS(uv)) {
SV * dst;
dst = newSV(1);
(void)SvPOK_only(dst);
RETVAL = dst;
} else {
rstr = ix ? dec_compat(uv) : dec_canonical(uv);
- if(!rstr) XSRETURN_UNDEF;
+ if (!rstr)
+ XSRETURN_UNDEF;
RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
}
SvUTF8_on(RETVAL);