5 /* This file is prepared by mkheader */
10 #define utf8n_to_uvuni utf8_to_uv
11 #endif /* utf8n_to_uvuni */
/*
 * Fallback definitions for the UTF8_ALLOW_* decoder flags.
 * Each flag that is not already defined is defined here as (0), so that it
 * can still be OR-ed into AllowAnyUTF below without contributing any bit.
 */
13 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
14 #ifndef UTF8_ALLOW_BOM
15 #define UTF8_ALLOW_BOM (0)
16 #endif /* UTF8_ALLOW_BOM */
18 #ifndef UTF8_ALLOW_SURROGATE
19 #define UTF8_ALLOW_SURROGATE (0)
20 #endif /* UTF8_ALLOW_SURROGATE */
22 #ifndef UTF8_ALLOW_FE_FF
23 #define UTF8_ALLOW_FE_FF (0)
24 #endif /* UTF8_ALLOW_FE_FF */
26 #ifndef UTF8_ALLOW_FFFF
27 #define UTF8_ALLOW_FFFF (0)
28 #endif /* UTF8_ALLOW_FFFF */
/* Union of the four flags above; handed to utf8n_to_uvuni() as its last
 * argument when this file decodes a UTF-8 string into code points. */
30 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
32 /* if utf8n_to_uvuni() sets retlen to 0 (?) */
/* Message given to croak() inside the UTF-8 decoding loop, which advances its
 * read pointer by retlen on every iteration.
 * NOTE(review): presumably raised when retlen comes back as 0, which would
 * otherwise stall that loop -- confirm against the guarding condition. */
33 #define ErrRetlenIsZero "panic (Unicode::Collate): zero-length character"
35 /* At present, chars > 0x10ffff are passed through as they are, without complaint, right? */
/* Highest code point for which the UCA_simple table is consulted; the lookup
 * code tests !OVER_UTF_MAX(uv) before indexing the table with uv >> 16. */
36 #define VALID_UTF_MAX (0x10ffff)
/* True when uv lies strictly above VALID_UTF_MAX. */
37 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
/* Overflow guard for the hexadecimal parser: the accumulated value is compared
 * against this limit before it is shifted left by 4 bits (multiplied by 16). */
39 static const UV max_div_16 = UV_MAX / 16;
41 /* Supported Levels */
45 /* Shifted weight at 4th level */
/* Stored big-endian into bytes 7 and 8 of a VCE by the variable-weighting
 * code (high byte: Shift4Wt >> 8, low byte: Shift4Wt & 0xFF). */
46 #define Shift4Wt (0xFFFF)
/* Number of bytes in one VCE string as built and consumed in this file.
 * Bytes 1..8 hold the 16-bit weights, two bytes per level, read as
 * v[2*lv + 1] and v[2*lv + 2]; VCEs shorter than this are ignored.
 * NOTE(review): byte 0 looks like a "variable" flag -- confirm. */
48 #define VCE_Length (9)
/*
 * Hangul code point constants.
 *
 *  - *Base values are the arithmetic origins used when a syllable is split
 *    into its jamo: sindex = code - Hangul_SBase, then
 *    L = sindex / NCount, V = (sindex % NCount) / TCount, T = sindex % TCount,
 *    each added to the matching L/V/T base.
 *  - *Ini / *Fin pairs are inclusive ranges tested with codeRange().
 *    The L/V/T *Fin limits (plus Hangul_LFill) are the ones applied when
 *    uca_vers < 20; otherwise the wider *End limits and the L2/V2/T2
 *    ranges are applied instead.
 */
50 #define Hangul_SBase (0xAC00)
51 #define Hangul_SIni (0xAC00)
52 #define Hangul_SFin (0xD7A3)
/* 588 == 21 * Hangul_TCount: syllables per leading consonant */
53 #define Hangul_NCount (588)
/* a syllable whose (code - Hangul_SBase) % Hangul_TCount is 0 is typed "LV", otherwise "LVT" */
54 #define Hangul_TCount (28)
55 #define Hangul_LBase (0x1100)
56 #define Hangul_LIni (0x1100)
57 #define Hangul_LFin (0x1159)
58 #define Hangul_LFill (0x115F)
59 #define Hangul_LEnd (0x115F) /* Unicode 5.2 */
60 #define Hangul_VBase (0x1161)
61 #define Hangul_VIni (0x1160) /* from Vowel Filler */
62 #define Hangul_VFin (0x11A2)
63 #define Hangul_VEnd (0x11A7) /* Unicode 5.2 */
64 #define Hangul_TBase (0x11A7) /* from "no-final" codepoint */
65 #define Hangul_TIni (0x11A8)
66 #define Hangul_TFin (0x11F9)
67 #define Hangul_TEnd (0x11FF) /* Unicode 5.2 */
/* additional jamo ranges, tested only on the uca_vers >= 20 path */
68 #define HangulL2Ini (0xA960) /* Unicode 5.2 */
69 #define HangulL2Fin (0xA97C) /* Unicode 5.2 */
70 #define HangulV2Ini (0xD7B0) /* Unicode 5.2 */
71 #define HangulV2Fin (0xD7C6) /* Unicode 5.2 */
72 #define HangulT2Ini (0xD7CB) /* Unicode 5.2 */
73 #define HangulT2Fin (0xD7FB) /* Unicode 5.2 */
/*
 * CJK ideograph ranges (inclusive) used to classify a code point as a
 * unified ideograph.
 *
 * The upper end of the basic block depends on the UCA version in effect:
 *   uca_vers >= 20 -> CJK_UidF52, >= 18 -> CJK_UidF51, >= 14 -> CJK_UidF41,
 *   otherwise CJK_UidFin.
 * Extension A is accepted unconditionally; in _isUIdeo the other extensions
 * are gated by version: Ext. B needs uca_vers >= 8, Ext. C >= 20, Ext. D >= 22.
 */
75 #define CJK_UidIni (0x4E00)
76 #define CJK_UidFin (0x9FA5)
77 #define CJK_UidF41 (0x9FBB)
78 #define CJK_UidF51 (0x9FC3)
79 #define CJK_UidF52 (0x9FCB)
80 #define CJK_ExtAIni (0x3400) /* Unicode 3.0 */
81 #define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */
82 #define CJK_ExtBIni (0x20000) /* Unicode 3.1 */
83 #define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */
84 #define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */
85 #define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */
86 #define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */
87 #define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */
/*
 * Per-code-point flags for the 28 code points 0xFA0E..0xFA29, indexed as
 * UnifiedCompat[code - 0xFA0E]. A 1 marks a code point in that range that
 * is treated as a basic unified ideograph; a 0 marks one that is not.
 * The trailing comment gives the low hex digit of the code point that each
 * column corresponds to.
 */
89 static STDCHAR UnifiedCompat[] = {
90 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
91 }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
/* Inclusive range test: true when bcode <= code <= ecode.
 * Note that `code` is not a macro parameter; the macro relies on a variable
 * with exactly that name being in scope wherever it is expanded. */
93 #define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode))
95 MODULE = Unicode::Collate PACKAGE = Unicode::Collate
104 for (rest = UCA_rest; *rest; ++rest) {
105 XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
116 if (!OVER_UTF_MAX(uv)){
117 plane = (U8***)UCA_simple[uv >> 16];
119 row = plane[(uv >> 8) & 0xff];
120 result = row ? row[uv & 0xff] : NULL;
125 int num = (int)*result;
127 for (i = 0; i < num; ++i) {
128 XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
129 result += VCE_Length;
132 XPUSHs(sv_2mortal(newSViv(0)));
136 _ignorable_simple (uv)
145 if (!OVER_UTF_MAX(uv)){
146 plane = (U8***)UCA_simple[uv >> 16];
148 row = plane[(uv >> 8) & 0xff];
149 result = row ? row[uv & 0xff] : NULL;
152 num = (int)*result; /* assuming 0 <= num < 128 */
156 RETVAL = boolSV(num >0);
158 RETVAL = boolSV(num==0);
170 bool overflowed = FALSE;
171 const char *hexdigit;
174 for (e = s + byte; s < e;) {
175 hexdigit = strchr((char *) PL_hexdigit, *s++);
178 value = (hexdigit - PL_hexdigit) & 0xF;
180 hexdigit = strchr((char *) PL_hexdigit, *s++);
185 if (value > max_div_16) {
189 value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
191 XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
201 if (!sv || !SvIOK(sv))
205 0x10FFFF < uv /* out of range */
217 /* should be called only if ! _isIllegal(sv). */
220 ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */
221 || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
222 || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
232 UV sindex, lindex, vindex, tindex;
234 /* code *must* be a Hangul syllable.
235 * Check it before you enter here. */
236 sindex = code - Hangul_SBase;
237 lindex = sindex / Hangul_NCount;
238 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
239 tindex = sindex % Hangul_TCount;
241 XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
242 XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
244 XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
248 getHST (code, uca_vers = 0)
255 if (codeRange(Hangul_SIni, Hangul_SFin)) {
256 if ((code - Hangul_SBase) % Hangul_TCount) {
257 hangtype = "LVT"; typelen = 3;
259 hangtype = "LV"; typelen = 2;
261 } else if (uca_vers < 20) {
262 if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
263 hangtype = "L"; typelen = 1;
264 } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
265 hangtype = "V"; typelen = 1;
266 } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
267 hangtype = "T"; typelen = 1;
269 hangtype = ""; typelen = 0;
272 if (codeRange(Hangul_LIni, Hangul_LEnd) ||
273 codeRange(HangulL2Ini, HangulL2Fin)) {
274 hangtype = "L"; typelen = 1;
275 } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
276 codeRange(HangulV2Ini, HangulV2Fin)) {
277 hangtype = "V"; typelen = 1;
278 } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
279 codeRange(HangulT2Ini, HangulT2Fin)) {
280 hangtype = "T"; typelen = 1;
282 hangtype = ""; typelen = 0;
286 RETVAL = newSVpvn(hangtype, typelen);
301 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
302 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
303 bool basic_unified = 0;
305 if (CJK_UidIni <= code) {
306 if (codeRange(0xFA0E, 0xFA29))
307 basic_unified = (bool)UnifiedCompat[code - 0xFA0E];
309 basic_unified = (ix >= 3 ? (code <= CJK_UidF52) :
310 ix == 2 ? (code <= CJK_UidF51) :
311 ix == 1 ? (code <= CJK_UidF41) :
312 (code <= CJK_UidFin));
314 base = (basic_unified)
316 ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
318 (codeRange(CJK_ExtBIni, CJK_ExtBFin))
320 (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
322 (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
323 ? 0xFB80 /* CJK ext. */
324 : 0xFBC0; /* others */
325 aaaa = base + (code >> 15);
326 bbbb = (code & 0x7FFF) | 0x8000;
327 a[1] = (U8)(aaaa >> 8);
328 a[2] = (U8)(aaaa & 0xFF);
329 b[1] = (U8)(bbbb >> 8);
330 b[2] = (U8)(bbbb & 0xFF);
331 a[7] = b[7] = (U8)(code >> 8);
332 a[8] = b[8] = (U8)(code & 0xFF);
333 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
334 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
342 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF";
343 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
345 aaaa = 0xFF80 + (code >> 15);
346 bbbb = (code & 0x7FFF) | 0x8000;
347 a[1] = (U8)(aaaa >> 8);
348 a[2] = (U8)(aaaa & 0xFF);
349 b[1] = (U8)(bbbb >> 8);
350 b[2] = (U8)(bbbb & 0xFF);
351 a[7] = b[7] = (U8)(code >> 8);
352 a[8] = b[8] = (U8)(code & 0xFF);
353 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
354 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
361 U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
363 uice[1] = uice[7] = (U8)(code >> 8);
364 uice[2] = uice[8] = (U8)(code & 0xFF);
365 XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
369 _isUIdeo (code, uca_vers)
372 bool basic_unified = 0;
374 /* uca_vers = 0 for _uideoCE_8() */
375 if (CJK_UidIni <= code) {
376 if (codeRange(0xFA0E, 0xFA29))
377 basic_unified = (bool)UnifiedCompat[code - 0xFA0E];
379 basic_unified = (uca_vers >= 20 ? (code <= CJK_UidF52) :
380 uca_vers >= 18 ? (code <= CJK_UidF51) :
381 uca_vers >= 14 ? (code <= CJK_UidF41) :
382 (code <= CJK_UidFin));
387 (codeRange(CJK_ExtAIni, CJK_ExtAFin))
389 (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
391 (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
393 (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
400 mk_SortKey (self, buf)
406 U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
411 IV lv, level, uca_vers;
412 bool upper_lower, kata_hira, v2i, last_is_var;
414 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
415 selfHV = (HV*)SvRV(self);
417 croak("$self is not a HASHREF.");
419 svp = hv_fetch(selfHV, "level", 5, FALSE);
420 level = svp ? SvIV(*svp) : MaxLevel;
422 if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
423 bufAV = (AV*)SvRV(buf);
425 croak("XSUB, not an ARRAYREF.");
427 buf_len = av_len(bufAV);
429 if (buf_len < 0) { /* empty: -1 */
430 dlen = 2 * (MaxLevel - 1);
432 (void)SvPOK_only(dst);
438 for (lv = 0; lv < level; lv++) {
439 New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
440 s[lv] = eachlevel[lv];
443 svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
444 upper_lower = svp ? SvTRUE(*svp) : FALSE;
445 svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
446 kata_hira = svp ? SvTRUE(*svp) : FALSE;
447 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
448 uca_vers = SvIV(*svp);
449 svp = hv_fetch(selfHV, "variable", 8, FALSE);
450 v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
451 ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
455 for (i = 0; i <= buf_len; i++) {
456 svp = av_fetch(bufAV, i, FALSE);
458 if (svp && SvPOK(*svp))
459 v = (U8*)SvPV(*svp, vlen);
463 if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
466 /* "Ignorable (L1, L2) after Variable" since tracking version 9 */
470 else if (v[1] || v[2]) /* non zero primary weight */
472 else if (last_is_var) /* zero primary weight; skipped */
476 if (v[5] == 0) { /* tert wt < 256 */
478 if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
480 else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
482 else if (v[6] == 0x1C) /* square upper */
484 else if (v[6] == 0x1D) /* square lower */
488 if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
490 else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
495 for (lv = 0; lv < level; lv++) {
496 if (v[2 * lv + 1] || v[2 * lv + 2]) {
497 *s[lv]++ = v[2 * lv + 1];
498 *s[lv]++ = v[2 * lv + 2];
503 dlen = 2 * (MaxLevel - 1);
504 for (lv = 0; lv < level; lv++)
505 dlen += s[lv] - eachlevel[lv];
508 (void)SvPOK_only(dst);
511 svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
512 back_flag = svp ? SvUV(*svp) : (UV)0;
514 for (lv = 0; lv < level; lv++) {
515 if (back_flag & (1 << (lv + 1))) {
518 for ( ; e < p; p -= 2) {
529 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
535 for (lv = level; lv < MaxLevel; lv++) {
536 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
542 for (lv = 0; lv < level; lv++) {
543 Safefree(eachlevel[lv]);
547 SvCUR_set(dst, d - (U8*)SvPVX(dst));
562 a = (U8*)SvPV(vbl, alen);
563 v = (U8*)SvPV(vce, vlen);
567 (void)SvPOK_only(dst);
568 Copy(v, d, vlen, U8);
569 SvCUR_set(dst, vlen);
572 /* variable: only the first char and the length are checked,
573 trusting checkCollator() and %VariableOK in Perl ... */
575 if (vlen < VCE_Length /* ignore short VCE (unexpected) */
577 *a == 'n') /* 'non-ignorable' */
580 if (*a == 's') { /* shifted or shift-trimmed */
581 d[7] = d[1]; /* wt level 1 to 4 */
584 d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
586 else if (*a == 'b') /* blanked */
588 else if (*a == 's') { /* shifted or shift-trimmed */
589 if (alen == 7 && (d[1] + d[2] + d[3] + d[4] + d[5] + d[6])) {
590 d[7] = (U8)(Shift4Wt >> 8);
591 d[8] = (U8)(Shift4Wt & 0xFF);
598 croak("unknown variable value '%s'", a);
606 visualizeSortKey (self, key)
616 static char *upperhex = "0123456789ABCDEF";
618 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
619 selfHV = (HV*)SvRV(self);
621 croak("$self is not a HASHREF.");
623 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
625 croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
626 uca_vers = SvIV(*svp);
628 s = (U8*)SvPV(key, klen);
630 /* slightly *longer* than needed, but I'm afraid of miscounting;
631 exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0')
632 = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units
633 + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries
636 dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
638 (void)SvPOK_only(dst);
642 for (e = s + klen; s < e; s += 2) {
643 uv = (U16)(*s << 8 | s[1]);
645 if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
647 *d++ = upperhex[ (s[0] >> 4) & 0xF ];
648 *d++ = upperhex[ s[0] & 0xF ];
649 *d++ = upperhex[ (s[1] >> 4) & 0xF ];
650 *d++ = upperhex[ s[1] & 0xF ];
653 if ((9 <= uca_vers) && (d[-1] != '['))
660 SvCUR_set(dst, d - (U8*)SvPVX(dst));
671 STRLEN srclen, retlen;
675 s = (U8*)SvPV(src,srclen);
677 SV* tmpsv = sv_mortalcopy(src);
679 (void)sv_pvn_force(tmpsv,&srclen);
680 sv_utf8_upgrade(tmpsv);
681 s = (U8*)SvPV(tmpsv,srclen);
685 for (p = s; p < e; p += retlen) {
686 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
688 croak(ErrRetlenIsZero);
689 XPUSHs(sv_2mortal(newSVuv(uv)));