This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Test file had been renamed. remove old cruft
[perl5.git] / cpan / Unicode-Collate / Collate.xs
CommitLineData
cd7f10f7
CBW
1
2#define PERL_NO_GET_CONTEXT /* we want efficiency */
3
4/* I guese no private function needs pTHX_ and aTHX_ */
5
f58b9ef1
CBW
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
9
10/* This file is prepared by mkheader */
11#include "ucatbl.h"
12
d2309057 13/* Perl 5.6.1 ? */
f8187d97 14#ifdef utf8_to_uv
d2309057 15#define utf8n_to_uvuni utf8_to_uv
f8187d97 16#endif /* utf8_to_uv */
d2309057
CBW
17
18/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
19#ifndef UTF8_ALLOW_BOM
20#define UTF8_ALLOW_BOM (0)
21#endif /* UTF8_ALLOW_BOM */
22
23#ifndef UTF8_ALLOW_SURROGATE
24#define UTF8_ALLOW_SURROGATE (0)
25#endif /* UTF8_ALLOW_SURROGATE */
26
27#ifndef UTF8_ALLOW_FE_FF
28#define UTF8_ALLOW_FE_FF (0)
29#endif /* UTF8_ALLOW_FE_FF */
30
31#ifndef UTF8_ALLOW_FFFF
32#define UTF8_ALLOW_FFFF (0)
33#endif /* UTF8_ALLOW_FFFF */
34
35#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
36
37/* perl 5.6.x workaround, before 5.8.0 */
38#ifdef utf8n_to_uvuni
39#define GET_UV_FOR_5_6 utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF)
40#else
41#define GET_UV_FOR_5_6 retlen = 1 /* avoid an infinite loop */
42#endif /* utf8n_to_uvuni */
43
f58b9ef1
CBW
44/* At present, char > 0x10ffff are unaffected without complaint, right? */
45#define VALID_UTF_MAX (0x10ffff)
46#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
47
48static const UV max_div_16 = UV_MAX / 16;
49
50/* Supported Levels */
51#define MinLevel (1)
52#define MaxLevel (4)
53
54/* Shifted weight at 4th level */
55#define Shift4Wt (0xFFFF)
56
57#define VCE_Length (9)
58
59#define Hangul_SBase (0xAC00)
60#define Hangul_SIni (0xAC00)
61#define Hangul_SFin (0xD7A3)
62#define Hangul_NCount (588)
63#define Hangul_TCount (28)
64#define Hangul_LBase (0x1100)
65#define Hangul_LIni (0x1100)
66#define Hangul_LFin (0x1159)
67#define Hangul_LFill (0x115F)
68#define Hangul_LEnd (0x115F) /* Unicode 5.2 */
69#define Hangul_VBase (0x1161)
70#define Hangul_VIni (0x1160) /* from Vowel Filler */
71#define Hangul_VFin (0x11A2)
72#define Hangul_VEnd (0x11A7) /* Unicode 5.2 */
73#define Hangul_TBase (0x11A7) /* from "no-final" codepoint */
74#define Hangul_TIni (0x11A8)
75#define Hangul_TFin (0x11F9)
76#define Hangul_TEnd (0x11FF) /* Unicode 5.2 */
77#define HangulL2Ini (0xA960) /* Unicode 5.2 */
78#define HangulL2Fin (0xA97C) /* Unicode 5.2 */
79#define HangulV2Ini (0xD7B0) /* Unicode 5.2 */
80#define HangulV2Fin (0xD7C6) /* Unicode 5.2 */
81#define HangulT2Ini (0xD7CB) /* Unicode 5.2 */
82#define HangulT2Fin (0xD7FB) /* Unicode 5.2 */
83
84#define CJK_UidIni (0x4E00)
85#define CJK_UidFin (0x9FA5)
86#define CJK_UidF41 (0x9FBB)
87#define CJK_UidF51 (0x9FC3)
88#define CJK_UidF52 (0x9FCB)
cba8842c 89#define CJK_UidF61 (0x9FCC)
f58b9ef1
CBW
90#define CJK_ExtAIni (0x3400) /* Unicode 3.0 */
91#define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */
92#define CJK_ExtBIni (0x20000) /* Unicode 3.1 */
93#define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */
94#define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */
95#define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */
96#define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */
97#define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */
98
46267efc
CBW
99#define CJK_CompIni (0xFA0E)
100#define CJK_CompFin (0xFA29)
f58b9ef1
CBW
101static STDCHAR UnifiedCompat[] = {
102 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
103}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
104
105#define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode))
106
107MODULE = Unicode::Collate PACKAGE = Unicode::Collate
108
109PROTOTYPES: DISABLE
110
111void
112_fetch_rest ()
113 PREINIT:
114 char ** rest;
115 PPCODE:
116 for (rest = UCA_rest; *rest; ++rest) {
117 XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
118 }
119
120
121void
122_fetch_simple (uv)
123 UV uv
124 PREINIT:
125 U8 ***plane, **row;
126 U8* result = NULL;
127 PPCODE:
128 if (!OVER_UTF_MAX(uv)){
129 plane = (U8***)UCA_simple[uv >> 16];
130 if (plane) {
131 row = plane[(uv >> 8) & 0xff];
132 result = row ? row[uv & 0xff] : NULL;
133 }
134 }
135 if (result) {
136 int i;
137 int num = (int)*result;
138 ++result;
139 for (i = 0; i < num; ++i) {
140 XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
141 result += VCE_Length;
142 }
143 } else {
144 XPUSHs(sv_2mortal(newSViv(0)));
145 }
146
147SV*
148_ignorable_simple (uv)
149 UV uv
150 ALIAS:
151 _exists_simple = 1
152 PREINIT:
153 U8 ***plane, **row;
154 int num = -1;
155 U8* result = NULL;
156 CODE:
157 if (!OVER_UTF_MAX(uv)){
158 plane = (U8***)UCA_simple[uv >> 16];
159 if (plane) {
160 row = plane[(uv >> 8) & 0xff];
161 result = row ? row[uv & 0xff] : NULL;
162 }
163 if (result)
164 num = (int)*result; /* assuming 0 <= num < 128 */
165 }
166
167 if (ix)
168 RETVAL = boolSV(num >0);
169 else
170 RETVAL = boolSV(num==0);
171 OUTPUT:
172 RETVAL
173
174
175void
176_getHexArray (src)
177 SV* src
178 PREINIT:
179 char *s, *e;
180 STRLEN byte;
181 UV value;
182 bool overflowed = FALSE;
183 const char *hexdigit;
184 PPCODE:
185 s = SvPV(src,byte);
186 for (e = s + byte; s < e;) {
187 hexdigit = strchr((char *) PL_hexdigit, *s++);
188 if (! hexdigit)
189 continue;
190 value = (hexdigit - PL_hexdigit) & 0xF;
191 while (*s) {
192 hexdigit = strchr((char *) PL_hexdigit, *s++);
193 if (! hexdigit)
194 break;
195 if (overflowed)
196 continue;
197 if (value > max_div_16) {
198 overflowed = TRUE;
199 continue;
200 }
201 value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
202 }
203 XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
204 }
205
206
207SV*
208_isIllegal (sv)
209 SV* sv
210 PREINIT:
211 UV uv;
212 CODE:
213 if (!sv || !SvIOK(sv))
214 XSRETURN_YES;
215 uv = SvUVX(sv);
216 RETVAL = boolSV(
60f577e0 217 0x10FFFF < uv /* out of range */
4779e03e 218 || ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] */
f58b9ef1
CBW
219 || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
220 || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
221 );
222OUTPUT:
223 RETVAL
224
225
226void
227_decompHangul (code)
228 UV code
229 PREINIT:
230 UV sindex, lindex, vindex, tindex;
231 PPCODE:
232 /* code *must* be in Hangul syllable.
233 * Check it before you enter here. */
234 sindex = code - Hangul_SBase;
235 lindex = sindex / Hangul_NCount;
236 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
237 tindex = sindex % Hangul_TCount;
238
239 XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
240 XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
241 if (tindex)
242 XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
243
244
245SV*
246getHST (code, uca_vers = 0)
247 UV code;
248 IV uca_vers;
249 PREINIT:
0cb4637e 250 const char * hangtype;
f58b9ef1
CBW
251 STRLEN typelen;
252 CODE:
253 if (codeRange(Hangul_SIni, Hangul_SFin)) {
254 if ((code - Hangul_SBase) % Hangul_TCount) {
255 hangtype = "LVT"; typelen = 3;
256 } else {
257 hangtype = "LV"; typelen = 2;
258 }
259 } else if (uca_vers < 20) {
260 if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
261 hangtype = "L"; typelen = 1;
262 } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
263 hangtype = "V"; typelen = 1;
264 } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
265 hangtype = "T"; typelen = 1;
266 } else {
267 hangtype = ""; typelen = 0;
268 }
269 } else {
270 if (codeRange(Hangul_LIni, Hangul_LEnd) ||
271 codeRange(HangulL2Ini, HangulL2Fin)) {
272 hangtype = "L"; typelen = 1;
273 } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
274 codeRange(HangulV2Ini, HangulV2Fin)) {
275 hangtype = "V"; typelen = 1;
276 } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
277 codeRange(HangulT2Ini, HangulT2Fin)) {
278 hangtype = "T"; typelen = 1;
279 } else {
280 hangtype = ""; typelen = 0;
281 }
282 }
283
284 RETVAL = newSVpvn(hangtype, typelen);
285OUTPUT:
286 RETVAL
287
288
289void
290_derivCE_9 (code)
291 UV code
292 ALIAS:
293 _derivCE_14 = 1
294 _derivCE_18 = 2
295 _derivCE_20 = 3
296 _derivCE_22 = 4
cba8842c 297 _derivCE_24 = 5
f58b9ef1
CBW
298 PREINIT:
299 UV base, aaaa, bbbb;
300 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
301 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
302 bool basic_unified = 0;
303 PPCODE:
304 if (CJK_UidIni <= code) {
46267efc
CBW
305 if (codeRange(CJK_CompIni, CJK_CompFin))
306 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
f58b9ef1 307 else
cba8842c
A
308 basic_unified = (ix >= 5 ? (code <= CJK_UidF61) :
309 ix >= 3 ? (code <= CJK_UidF52) :
f58b9ef1
CBW
310 ix == 2 ? (code <= CJK_UidF51) :
311 ix == 1 ? (code <= CJK_UidF41) :
312 (code <= CJK_UidFin));
313 }
314 base = (basic_unified)
315 ? 0xFB40 : /* CJK */
316 ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
317 ||
318 (codeRange(CJK_ExtBIni, CJK_ExtBFin))
319 ||
320 (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
321 ||
322 (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
323 ? 0xFB80 /* CJK ext. */
324 : 0xFBC0; /* others */
325 aaaa = base + (code >> 15);
326 bbbb = (code & 0x7FFF) | 0x8000;
327 a[1] = (U8)(aaaa >> 8);
328 a[2] = (U8)(aaaa & 0xFF);
329 b[1] = (U8)(bbbb >> 8);
330 b[2] = (U8)(bbbb & 0xFF);
331 a[7] = b[7] = (U8)(code >> 8);
332 a[8] = b[8] = (U8)(code & 0xFF);
333 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
334 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
335
336
337void
338_derivCE_8 (code)
339 UV code
340 PREINIT:
341 UV aaaa, bbbb;
342 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF";
343 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
344 PPCODE:
345 aaaa = 0xFF80 + (code >> 15);
346 bbbb = (code & 0x7FFF) | 0x8000;
347 a[1] = (U8)(aaaa >> 8);
348 a[2] = (U8)(aaaa & 0xFF);
349 b[1] = (U8)(bbbb >> 8);
350 b[2] = (U8)(bbbb & 0xFF);
351 a[7] = b[7] = (U8)(code >> 8);
352 a[8] = b[8] = (U8)(code & 0xFF);
353 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
354 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
355
356
357void
358_uideoCE_8 (code)
359 UV code
360 PREINIT:
361 U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
362 PPCODE:
363 uice[1] = uice[7] = (U8)(code >> 8);
364 uice[2] = uice[8] = (U8)(code & 0xFF);
365 XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
366
367
368SV*
369_isUIdeo (code, uca_vers)
370 UV code;
371 IV uca_vers;
372 bool basic_unified = 0;
373 CODE:
374 /* uca_vers = 0 for _uideoCE_8() */
375 if (CJK_UidIni <= code) {
46267efc
CBW
376 if (codeRange(CJK_CompIni, CJK_CompFin))
377 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
f58b9ef1 378 else
cba8842c
A
379 basic_unified = (uca_vers >= 24 ? (code <= CJK_UidF61) :
380 uca_vers >= 20 ? (code <= CJK_UidF52) :
f58b9ef1
CBW
381 uca_vers >= 18 ? (code <= CJK_UidF51) :
382 uca_vers >= 14 ? (code <= CJK_UidF41) :
383 (code <= CJK_UidFin));
384 }
385 RETVAL = boolSV(
386 (basic_unified)
387 ||
388 (codeRange(CJK_ExtAIni, CJK_ExtAFin))
389 ||
390 (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
391 ||
392 (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
393 ||
394 (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
395 );
396OUTPUT:
397 RETVAL
398
399
400SV*
401mk_SortKey (self, buf)
402 SV* self;
403 SV* buf;
404 PREINIT:
405 SV *dst, **svp;
406 STRLEN dlen, vlen;
407 U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
408 AV *bufAV;
409 HV *selfHV;
410 UV back_flag;
411 I32 i, buf_len;
412 IV lv, level, uca_vers;
413 bool upper_lower, kata_hira, v2i, last_is_var;
414 CODE:
415 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
416 selfHV = (HV*)SvRV(self);
417 else
418 croak("$self is not a HASHREF.");
419
f58b9ef1
CBW
420 if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
421 bufAV = (AV*)SvRV(buf);
422 else
423 croak("XSUB, not an ARRAYREF.");
424
425 buf_len = av_len(bufAV);
426
427 if (buf_len < 0) { /* empty: -1 */
428 dlen = 2 * (MaxLevel - 1);
429 dst = newSV(dlen);
430 (void)SvPOK_only(dst);
431 d = (U8*)SvPVX(dst);
432 while (dlen--)
433 *d++ = '\0';
19265284
CBW
434 } else {
435 svp = hv_fetch(selfHV, "level", 5, FALSE);
436 level = svp ? SvIV(*svp) : MaxLevel;
437
f58b9ef1
CBW
438 for (lv = 0; lv < level; lv++) {
439 New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
440 s[lv] = eachlevel[lv];
441 }
442
443 svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
444 upper_lower = svp ? SvTRUE(*svp) : FALSE;
445 svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
446 kata_hira = svp ? SvTRUE(*svp) : FALSE;
447 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
448 uca_vers = SvIV(*svp);
449 svp = hv_fetch(selfHV, "variable", 8, FALSE);
450 v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
451 ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
452 : FALSE;
453
454 last_is_var = FALSE;
455 for (i = 0; i <= buf_len; i++) {
456 svp = av_fetch(bufAV, i, FALSE);
457
458 if (svp && SvPOK(*svp))
459 v = (U8*)SvPV(*svp, vlen);
460 else
461 croak("not a vwt.");
462
463 if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
464 continue;
465
466 /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
467 if (v2i) {
468 if (*v)
469 last_is_var = TRUE;
470 else if (v[1] || v[2]) /* non zero primary weight */
471 last_is_var = FALSE;
472 else if (last_is_var) /* zero primary weight; skipped */
473 continue;
474 }
475
476 if (v[5] == 0) { /* tert wt < 256 */
477 if (upper_lower) {
478 if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
479 v[6] -= 6;
480 else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
481 v[6] += 6;
482 else if (v[6] == 0x1C) /* square upper */
483 v[6]++;
484 else if (v[6] == 0x1D) /* square lower */
485 v[6]--;
486 }
487 if (kata_hira) {
488 if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
489 v[6] -= 2;
490 else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
491 v[6] += 5;
492 }
493 }
494
495 for (lv = 0; lv < level; lv++) {
496 if (v[2 * lv + 1] || v[2 * lv + 2]) {
497 *s[lv]++ = v[2 * lv + 1];
498 *s[lv]++ = v[2 * lv + 2];
499 }
500 }
501 }
502
503 dlen = 2 * (MaxLevel - 1);
504 for (lv = 0; lv < level; lv++)
505 dlen += s[lv] - eachlevel[lv];
506
507 dst = newSV(dlen);
508 (void)SvPOK_only(dst);
509 d = (U8*)SvPVX(dst);
510
511 svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
512 back_flag = svp ? SvUV(*svp) : (UV)0;
513
514 for (lv = 0; lv < level; lv++) {
515 if (back_flag & (1 << (lv + 1))) {
516 p = s[lv];
517 e = eachlevel[lv];
518 for ( ; e < p; p -= 2) {
519 *d++ = p[-2];
520 *d++ = p[-1];
521 }
522 }
523 else {
524 p = eachlevel[lv];
525 e = s[lv];
526 while (p < e)
527 *d++ = *p++;
528 }
529 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
530 *d++ = '\0';
531 *d++ = '\0';
532 }
533 }
534
535 for (lv = level; lv < MaxLevel; lv++) {
536 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
537 *d++ = '\0';
538 *d++ = '\0';
539 }
540 }
541
542 for (lv = 0; lv < level; lv++) {
543 Safefree(eachlevel[lv]);
544 }
545 }
546 *d = '\0';
547 SvCUR_set(dst, d - (U8*)SvPVX(dst));
548 RETVAL = dst;
549OUTPUT:
550 RETVAL
551
552
553SV*
19265284
CBW
554varCE (self, vce)
555 SV* self;
556 SV* vce;
f58b9ef1 557 PREINIT:
19265284
CBW
558 SV *dst, *vbl, **svp;
559 HV *selfHV;
f58b9ef1
CBW
560 U8 *a, *v, *d;
561 STRLEN alen, vlen;
19265284
CBW
562 bool ig_l2;
563 UV totwt;
f58b9ef1 564 CODE:
19265284
CBW
565 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
566 selfHV = (HV*)SvRV(self);
567 else
568 croak("$self is not a HASHREF.");
569
570 svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
571 ig_l2 = svp ? SvTRUE(*svp) : FALSE;
572
573 svp = hv_fetch(selfHV, "variable", 8, FALSE);
574 vbl = svp ? *svp : &PL_sv_no;
f58b9ef1
CBW
575 a = (U8*)SvPV(vbl, alen);
576 v = (U8*)SvPV(vce, vlen);
577
578 dst = newSV(vlen);
579 d = (U8*)SvPVX(dst);
580 (void)SvPOK_only(dst);
581 Copy(v, d, vlen, U8);
582 SvCUR_set(dst, vlen);
583 d[vlen] = '\0';
584
19265284
CBW
585 /* primary weight == 0 && secondary weight != 0 */
586 if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
587 d[3] = d[4] = d[5] = d[6] = '\0';
588 }
589
f58b9ef1
CBW
590 /* variable: checked only the first char and the length,
591 trusting checkCollator() and %VariableOK in Perl ... */
592
f8187d97
SH
593 if (vlen >= VCE_Length && *a != 'n') {
594 if (*v) {
595 if (*a == 's') { /* shifted or shift-trimmed */
750da838
CBW
596 d[7] = d[1]; /* wt level 1 to 4 */
597 d[8] = d[2];
f8187d97
SH
598 } /* else blanked */
599 d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
600 } else if (*a == 's') { /* shifted or shift-trimmed */
601 totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
602 if (alen == 7 && totwt != 0) { /* shifted */
603 if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
604 d[7] = d[1]; /* wt level 1 to 4 */
605 d[8] = d[2];
606 } else {
607 d[7] = (U8)(Shift4Wt >> 8);
608 d[8] = (U8)(Shift4Wt & 0xFF);
609 }
610 } else { /* shift-trimmed or completely ignorable */
611 d[7] = d[8] = '\0';
750da838 612 }
f8187d97
SH
613 } /* else blanked */
614 } /* else non-ignorable */
f58b9ef1
CBW
615 RETVAL = dst;
616OUTPUT:
617 RETVAL
618
619
620
621SV*
622visualizeSortKey (self, key)
623 SV * self
624 SV * key
625 PREINIT:
626 HV *selfHV;
627 SV **svp, *dst;
628 U8 *s, *e, *d;
629 STRLEN klen, dlen;
630 UV uv;
750da838 631 IV uca_vers, sep = 0;
0cb4637e 632 static const char *upperhex = "0123456789ABCDEF";
f58b9ef1
CBW
633 CODE:
634 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
635 selfHV = (HV*)SvRV(self);
636 else
637 croak("$self is not a HASHREF.");
638
639 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
640 if (!svp)
641 croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
642 uca_vers = SvIV(*svp);
643
644 s = (U8*)SvPV(key, klen);
645
646 /* slightly *longer* than the need, but I'm afraid of miscounting;
750da838
CBW
647 = (klen / 2) * 5 - 1
648 # FFFF and ' ' for each 16bit units but ' ' is less by 1;
649 # ' ' and '|' for level boundaries including the identical level
650 + 2 # '[' and ']'
651 + 1 # '\0'
652 (a) if klen is odd (not expected), maybe more 5 bytes.
653 (b) there is not always the identical level.
f58b9ef1
CBW
654 */
655 dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
656 dst = newSV(dlen);
657 (void)SvPOK_only(dst);
658 d = (U8*)SvPVX(dst);
659
660 *d++ = '[';
661 for (e = s + klen; s < e; s += 2) {
662 uv = (U16)(*s << 8 | s[1]);
750da838 663 if (uv || sep >= MaxLevel) {
f58b9ef1
CBW
664 if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
665 *d++ = ' ';
666 *d++ = upperhex[ (s[0] >> 4) & 0xF ];
667 *d++ = upperhex[ s[0] & 0xF ];
668 *d++ = upperhex[ (s[1] >> 4) & 0xF ];
669 *d++ = upperhex[ s[1] & 0xF ];
750da838 670 } else {
f58b9ef1
CBW
671 if ((9 <= uca_vers) && (d[-1] != '['))
672 *d++ = ' ';
673 *d++ = '|';
750da838 674 ++sep;
f58b9ef1
CBW
675 }
676 }
677 *d++ = ']';
678 *d = '\0';
679 SvCUR_set(dst, d - (U8*)SvPVX(dst));
680 RETVAL = dst;
681OUTPUT:
682 RETVAL
683
d2309057
CBW
684
685
686void
687unpackUfor56 (src)
688 SV* src
689 PREINIT:
690 STRLEN srclen, retlen;
691 U8 *s, *p, *e;
692 UV uv;
693 PPCODE:
694 s = (U8*)SvPV(src,srclen);
695 if (!SvUTF8(src)) {
696 SV* tmpsv = sv_mortalcopy(src);
697 if (!SvPOK(tmpsv))
698 (void)sv_pvn_force(tmpsv,&srclen);
699 sv_utf8_upgrade(tmpsv);
700 s = (U8*)SvPV(tmpsv,srclen);
701 }
702 e = s + srclen;
703
704 for (p = s; p < e; p += retlen) {
705 uv = GET_UV_FOR_5_6; /* perl 5.6.x workaround */
706 if (!retlen)
707 croak("panic (Unicode::Collate): zero-length character");
708 XPUSHs(sv_2mortal(newSVuv(uv)));
709 }
710