2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
4 /* I guese no private function needs pTHX_ and aTHX_ */
10 /* This file is prepared by mkheader */
13 /* At present, char > 0x10ffff are unaffected without complaint, right? */
14 #define VALID_UTF_MAX (0x10ffff)
15 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
17 #define MAX_DIV_16 (UV_MAX / 16)
19 /* Supported Levels */
23 /* Shifted weight at 4th level */
24 #define Shift4Wt (0xFFFF)
26 #define VCE_Length (9)
28 #define Hangul_SBase (0xAC00)
29 #define Hangul_SIni (0xAC00)
30 #define Hangul_SFin (0xD7A3)
31 #define Hangul_NCount (588)
32 #define Hangul_TCount (28)
33 #define Hangul_LBase (0x1100)
34 #define Hangul_LIni (0x1100)
35 #define Hangul_LFin (0x1159)
36 #define Hangul_LFill (0x115F)
37 #define Hangul_LEnd (0x115F) /* Unicode 5.2 */
38 #define Hangul_VBase (0x1161)
39 #define Hangul_VIni (0x1160) /* from Vowel Filler */
40 #define Hangul_VFin (0x11A2)
41 #define Hangul_VEnd (0x11A7) /* Unicode 5.2 */
42 #define Hangul_TBase (0x11A7) /* from "no-final" codepoint */
43 #define Hangul_TIni (0x11A8)
44 #define Hangul_TFin (0x11F9)
45 #define Hangul_TEnd (0x11FF) /* Unicode 5.2 */
46 #define HangulL2Ini (0xA960) /* Unicode 5.2 */
47 #define HangulL2Fin (0xA97C) /* Unicode 5.2 */
48 #define HangulV2Ini (0xD7B0) /* Unicode 5.2 */
49 #define HangulV2Fin (0xD7C6) /* Unicode 5.2 */
50 #define HangulT2Ini (0xD7CB) /* Unicode 5.2 */
51 #define HangulT2Fin (0xD7FB) /* Unicode 5.2 */
53 #define CJK_UidIni (0x4E00)
54 #define CJK_UidFin (0x9FA5)
55 #define CJK_UidF41 (0x9FBB)
56 #define CJK_UidF51 (0x9FC3)
57 #define CJK_UidF52 (0x9FCB)
58 #define CJK_UidF61 (0x9FCC)
59 #define CJK_ExtAIni (0x3400) /* Unicode 3.0 */
60 #define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */
61 #define CJK_ExtBIni (0x20000) /* Unicode 3.1 */
62 #define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */
63 #define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */
64 #define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */
65 #define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */
66 #define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */
68 #define CJK_CompIni (0xFA0E)
69 #define CJK_CompFin (0xFA29)
70 static const STDCHAR UnifiedCompat[] = {
71 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
72 }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
74 #define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode))
76 MODULE = Unicode::Collate PACKAGE = Unicode::Collate
85 for (rest = (char **)UCA_rest; *rest; ++rest) {
86 XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
97 if (!OVER_UTF_MAX(uv)){
98 plane = (U8***)UCA_simple[uv >> 16];
100 row = plane[(uv >> 8) & 0xff];
101 result = row ? row[uv & 0xff] : NULL;
106 int num = (int)*result;
109 for (i = 0; i < num; ++i) {
110 PUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
111 result += VCE_Length;
114 PUSHs(sv_2mortal(newSViv(0)));
118 _ignorable_simple (uv)
127 if (!OVER_UTF_MAX(uv)){
128 plane = (U8***)UCA_simple[uv >> 16];
130 row = plane[(uv >> 8) & 0xff];
131 result = row ? row[uv & 0xff] : NULL;
134 num = (int)*result; /* assuming 0 <= num < 128 */
138 RETVAL = boolSV(num >0);
140 RETVAL = boolSV(num==0);
152 bool overflowed = FALSE;
153 const char *hexdigit;
156 for (e = s + byte; s < e;) {
157 hexdigit = strchr((char *) PL_hexdigit, *s++);
160 value = (hexdigit - PL_hexdigit) & 0xF;
162 hexdigit = strchr((char *) PL_hexdigit, *s++);
167 if (value > MAX_DIV_16) {
171 value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
173 XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
183 if (!sv || !SvIOK(sv))
187 0x10FFFF < uv /* out of range */
188 || ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] */
189 || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
190 || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
200 UV sindex, lindex, vindex, tindex;
202 /* code *must* be in Hangul syllable.
203 * Check it before you enter here. */
204 sindex = code - Hangul_SBase;
205 lindex = sindex / Hangul_NCount;
206 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
207 tindex = sindex % Hangul_TCount;
209 EXTEND(SP, tindex ? 3 : 2);
210 PUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
211 PUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
213 PUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
217 getHST (code, uca_vers = 0)
221 const char * hangtype;
224 if (codeRange(Hangul_SIni, Hangul_SFin)) {
225 if ((code - Hangul_SBase) % Hangul_TCount) {
226 hangtype = "LVT"; typelen = 3;
228 hangtype = "LV"; typelen = 2;
230 } else if (uca_vers < 20) {
231 if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
232 hangtype = "L"; typelen = 1;
233 } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
234 hangtype = "V"; typelen = 1;
235 } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
236 hangtype = "T"; typelen = 1;
238 hangtype = ""; typelen = 0;
241 if (codeRange(Hangul_LIni, Hangul_LEnd) ||
242 codeRange(HangulL2Ini, HangulL2Fin)) {
243 hangtype = "L"; typelen = 1;
244 } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
245 codeRange(HangulV2Ini, HangulV2Fin)) {
246 hangtype = "V"; typelen = 1;
247 } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
248 codeRange(HangulT2Ini, HangulT2Fin)) {
249 hangtype = "T"; typelen = 1;
251 hangtype = ""; typelen = 0;
255 RETVAL = newSVpvn(hangtype, typelen);
271 U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
272 U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
273 bool basic_unified = 0;
275 if (CJK_UidIni <= code) {
276 if (codeRange(CJK_CompIni, CJK_CompFin))
277 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
279 basic_unified = (ix >= 5 ? (code <= CJK_UidF61) :
280 ix >= 3 ? (code <= CJK_UidF52) :
281 ix == 2 ? (code <= CJK_UidF51) :
282 ix == 1 ? (code <= CJK_UidF41) :
283 (code <= CJK_UidFin));
285 base = (basic_unified)
287 ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
289 (codeRange(CJK_ExtBIni, CJK_ExtBFin))
291 (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
293 (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
294 ? 0xFB80 /* CJK ext. */
295 : 0xFBC0; /* others */
296 aaaa = base + (code >> 15);
297 bbbb = (code & 0x7FFF) | 0x8000;
298 a[1] = (U8)(aaaa >> 8);
299 a[2] = (U8)(aaaa & 0xFF);
300 b[1] = (U8)(bbbb >> 8);
301 b[2] = (U8)(bbbb & 0xFF);
302 a[4] = (U8)(0x20); /* second octet of level 2 */
303 a[6] = (U8)(0x02); /* second octet of level 3 */
304 a[7] = b[7] = (U8)(code >> 8);
305 a[8] = b[8] = (U8)(code & 0xFF);
307 PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
308 PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
316 U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
317 U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
319 aaaa = 0xFF80 + (code >> 15);
320 bbbb = (code & 0x7FFF) | 0x8000;
321 a[1] = (U8)(aaaa >> 8);
322 a[2] = (U8)(aaaa & 0xFF);
323 b[1] = (U8)(bbbb >> 8);
324 b[2] = (U8)(bbbb & 0xFF);
325 a[4] = (U8)(0x02); /* second octet of level 2 */
326 a[6] = (U8)(0x01); /* second octet of level 3 */
327 a[7] = b[7] = (U8)(code >> 8);
328 a[8] = b[8] = (U8)(code & 0xFF);
330 PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
331 PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
338 U8 uice[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
340 uice[1] = uice[7] = (U8)(code >> 8);
341 uice[2] = uice[8] = (U8)(code & 0xFF);
342 uice[4] = (U8)(0x20); /* second octet of level 2 */
343 uice[6] = (U8)(0x02); /* second octet of level 3 */
344 PUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
348 _isUIdeo (code, uca_vers)
351 bool basic_unified = 0;
353 /* uca_vers = 0 for _uideoCE_8() */
354 if (CJK_UidIni <= code) {
355 if (codeRange(CJK_CompIni, CJK_CompFin))
356 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
358 basic_unified = (uca_vers >= 24 ? (code <= CJK_UidF61) :
359 uca_vers >= 20 ? (code <= CJK_UidF52) :
360 uca_vers >= 18 ? (code <= CJK_UidF51) :
361 uca_vers >= 14 ? (code <= CJK_UidF41) :
362 (code <= CJK_UidFin));
367 (codeRange(CJK_ExtAIni, CJK_ExtAFin))
369 (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
371 (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
373 (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
380 mk_SortKey (self, buf)
386 U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
391 IV lv, level, uca_vers;
392 bool upper_lower, kata_hira, v2i, last_is_var;
394 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
395 selfHV = (HV*)SvRV(self);
397 croak("$self is not a HASHREF.");
399 if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
400 bufAV = (AV*)SvRV(buf);
402 croak("XSUB, not an ARRAYREF.");
404 buf_len = av_len(bufAV);
406 if (buf_len < 0) { /* empty: -1 */
407 dlen = 2 * (MaxLevel - 1);
409 (void)SvPOK_only(dst);
414 svp = hv_fetch(selfHV, "level", 5, FALSE);
415 level = svp ? SvIV(*svp) : MaxLevel;
417 for (lv = 0; lv < level; lv++) {
418 New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
419 s[lv] = eachlevel[lv];
422 svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
423 upper_lower = svp ? SvTRUE(*svp) : FALSE;
424 svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
425 kata_hira = svp ? SvTRUE(*svp) : FALSE;
426 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
427 uca_vers = SvIV(*svp);
428 svp = hv_fetch(selfHV, "variable", 8, FALSE);
429 v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
430 ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
434 for (i = 0; i <= buf_len; i++) {
435 svp = av_fetch(bufAV, i, FALSE);
437 if (svp && SvPOK(*svp))
438 v = (U8*)SvPV(*svp, vlen);
442 if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
445 /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
449 else if (v[1] || v[2]) /* non zero primary weight */
451 else if (last_is_var) /* zero primary weight; skipped */
455 if (v[5] == 0) { /* tert wt < 256 */
457 if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
459 else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
461 else if (v[6] == 0x1C) /* square upper */
463 else if (v[6] == 0x1D) /* square lower */
467 if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
469 else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
474 for (lv = 0; lv < level; lv++) {
475 if (v[2 * lv + 1] || v[2 * lv + 2]) {
476 *s[lv]++ = v[2 * lv + 1];
477 *s[lv]++ = v[2 * lv + 2];
482 dlen = 2 * (MaxLevel - 1);
483 for (lv = 0; lv < level; lv++)
484 dlen += s[lv] - eachlevel[lv];
487 (void)SvPOK_only(dst);
490 svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
491 back_flag = svp ? SvUV(*svp) : (UV)0;
493 for (lv = 0; lv < level; lv++) {
494 if (back_flag & (1 << (lv + 1))) {
497 for ( ; e < p; p -= 2) {
508 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
514 for (lv = level; lv < MaxLevel; lv++) {
515 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
521 for (lv = 0; lv < level; lv++) {
522 Safefree(eachlevel[lv]);
526 SvCUR_set(dst, d - (U8*)SvPVX(dst));
537 SV *dst, *vbl, **svp;
544 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
545 selfHV = (HV*)SvRV(self);
547 croak("$self is not a HASHREF.");
549 svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
550 ig_l2 = svp ? SvTRUE(*svp) : FALSE;
552 svp = hv_fetch(selfHV, "variable", 8, FALSE);
553 vbl = svp ? *svp : &PL_sv_no;
554 a = (U8*)SvPV(vbl, alen);
555 v = (U8*)SvPV(vce, vlen);
559 (void)SvPOK_only(dst);
560 Copy(v, d, vlen, U8);
561 SvCUR_set(dst, vlen);
564 /* primary weight == 0 && secondary weight != 0 */
565 if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
566 d[3] = d[4] = d[5] = d[6] = '\0';
569 /* variable: checked only the first char and the length,
570 trusting checkCollator() and %VariableOK in Perl ... */
572 if (vlen >= VCE_Length && *a != 'n') {
574 if (*a == 's') { /* shifted or shift-trimmed */
575 d[7] = d[1]; /* wt level 1 to 4 */
578 d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
579 } else if (*a == 's') { /* shifted or shift-trimmed */
580 totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
581 if (alen == 7 && totwt != 0) { /* shifted */
582 if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
583 d[7] = d[1]; /* wt level 1 to 4 */
586 d[7] = (U8)(Shift4Wt >> 8);
587 d[8] = (U8)(Shift4Wt & 0xFF);
589 } else { /* shift-trimmed or completely ignorable */
593 } /* else non-ignorable */
601 visualizeSortKey (self, key)
610 IV uca_vers, sep = 0;
611 const char *upperhex = "0123456789ABCDEF";
613 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
614 selfHV = (HV*)SvRV(self);
616 croak("$self is not a HASHREF.");
618 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
620 croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
621 uca_vers = SvIV(*svp);
623 s = (U8*)SvPV(key, klen);
625 /* slightly *longer* than the need, but I'm afraid of miscounting;
627 # FFFF and ' ' for each 16bit units but ' ' is less by 1;
628 # ' ' and '|' for level boundaries including the identical level
631 (a) if klen is odd (not expected), maybe more 5 bytes.
632 (b) there is not always the identical level.
634 dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
636 (void)SvPOK_only(dst);
640 for (e = s + klen; s < e; s += 2) {
641 uv = (U16)(*s << 8 | s[1]);
642 if (uv || sep >= MaxLevel) {
643 if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
645 *d++ = upperhex[ (s[0] >> 4) & 0xF ];
646 *d++ = upperhex[ s[0] & 0xF ];
647 *d++ = upperhex[ (s[1] >> 4) & 0xF ];
648 *d++ = upperhex[ s[1] & 0xF ];
650 if ((9 <= uca_vers) && (d[-1] != '['))
658 SvCUR_set(dst, d - (U8*)SvPVX(dst));