This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Updated Unicode-Collate to CPAN version 0.76
[perl5.git] / cpan / Unicode-Collate / Collate.xs
CommitLineData
f58b9ef1
CBW
1#include "EXTERN.h"
2#include "perl.h"
3#include "XSUB.h"
4
5/* This file is prepared by mkheader */
6#include "ucatbl.h"
7
8/* Perl 5.6.1 ? */
9#ifndef utf8n_to_uvuni
10#define utf8n_to_uvuni utf8_to_uv
11#endif /* utf8n_to_uvuni */
12
13/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
14#ifndef UTF8_ALLOW_BOM
15#define UTF8_ALLOW_BOM (0)
16#endif /* UTF8_ALLOW_BOM */
17
18#ifndef UTF8_ALLOW_SURROGATE
19#define UTF8_ALLOW_SURROGATE (0)
20#endif /* UTF8_ALLOW_SURROGATE */
21
22#ifndef UTF8_ALLOW_FE_FF
23#define UTF8_ALLOW_FE_FF (0)
24#endif /* UTF8_ALLOW_FE_FF */
25
26#ifndef UTF8_ALLOW_FFFF
27#define UTF8_ALLOW_FFFF (0)
28#endif /* UTF8_ALLOW_FFFF */
29
30#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
31
32/* if utf8n_to_uvuni() sets retlen to 0 (?) */
33#define ErrRetlenIsZero "panic (Unicode::Collate): zero-length character"
34
35/* At present, char > 0x10ffff are unaffected without complaint, right? */
36#define VALID_UTF_MAX (0x10ffff)
37#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
38
39static const UV max_div_16 = UV_MAX / 16;
40
41/* Supported Levels */
42#define MinLevel (1)
43#define MaxLevel (4)
44
45/* Shifted weight at 4th level */
46#define Shift4Wt (0xFFFF)
47
48#define VCE_Length (9)
49
50#define Hangul_SBase (0xAC00)
51#define Hangul_SIni (0xAC00)
52#define Hangul_SFin (0xD7A3)
53#define Hangul_NCount (588)
54#define Hangul_TCount (28)
55#define Hangul_LBase (0x1100)
56#define Hangul_LIni (0x1100)
57#define Hangul_LFin (0x1159)
58#define Hangul_LFill (0x115F)
59#define Hangul_LEnd (0x115F) /* Unicode 5.2 */
60#define Hangul_VBase (0x1161)
61#define Hangul_VIni (0x1160) /* from Vowel Filler */
62#define Hangul_VFin (0x11A2)
63#define Hangul_VEnd (0x11A7) /* Unicode 5.2 */
64#define Hangul_TBase (0x11A7) /* from "no-final" codepoint */
65#define Hangul_TIni (0x11A8)
66#define Hangul_TFin (0x11F9)
67#define Hangul_TEnd (0x11FF) /* Unicode 5.2 */
68#define HangulL2Ini (0xA960) /* Unicode 5.2 */
69#define HangulL2Fin (0xA97C) /* Unicode 5.2 */
70#define HangulV2Ini (0xD7B0) /* Unicode 5.2 */
71#define HangulV2Fin (0xD7C6) /* Unicode 5.2 */
72#define HangulT2Ini (0xD7CB) /* Unicode 5.2 */
73#define HangulT2Fin (0xD7FB) /* Unicode 5.2 */
74
75#define CJK_UidIni (0x4E00)
76#define CJK_UidFin (0x9FA5)
77#define CJK_UidF41 (0x9FBB)
78#define CJK_UidF51 (0x9FC3)
79#define CJK_UidF52 (0x9FCB)
80#define CJK_ExtAIni (0x3400) /* Unicode 3.0 */
81#define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */
82#define CJK_ExtBIni (0x20000) /* Unicode 3.1 */
83#define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */
84#define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */
85#define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */
86#define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */
87#define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */
88
89static STDCHAR UnifiedCompat[] = {
90 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
91}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
92
93#define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode))
94
95MODULE = Unicode::Collate PACKAGE = Unicode::Collate
96
97PROTOTYPES: DISABLE
98
99void
100_fetch_rest ()
101 PREINIT:
102 char ** rest;
103 PPCODE:
104 for (rest = UCA_rest; *rest; ++rest) {
105 XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
106 }
107
108
109void
110_fetch_simple (uv)
111 UV uv
112 PREINIT:
113 U8 ***plane, **row;
114 U8* result = NULL;
115 PPCODE:
116 if (!OVER_UTF_MAX(uv)){
117 plane = (U8***)UCA_simple[uv >> 16];
118 if (plane) {
119 row = plane[(uv >> 8) & 0xff];
120 result = row ? row[uv & 0xff] : NULL;
121 }
122 }
123 if (result) {
124 int i;
125 int num = (int)*result;
126 ++result;
127 for (i = 0; i < num; ++i) {
128 XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
129 result += VCE_Length;
130 }
131 } else {
132 XPUSHs(sv_2mortal(newSViv(0)));
133 }
134
135SV*
136_ignorable_simple (uv)
137 UV uv
138 ALIAS:
139 _exists_simple = 1
140 PREINIT:
141 U8 ***plane, **row;
142 int num = -1;
143 U8* result = NULL;
144 CODE:
145 if (!OVER_UTF_MAX(uv)){
146 plane = (U8***)UCA_simple[uv >> 16];
147 if (plane) {
148 row = plane[(uv >> 8) & 0xff];
149 result = row ? row[uv & 0xff] : NULL;
150 }
151 if (result)
152 num = (int)*result; /* assuming 0 <= num < 128 */
153 }
154
155 if (ix)
156 RETVAL = boolSV(num >0);
157 else
158 RETVAL = boolSV(num==0);
159 OUTPUT:
160 RETVAL
161
162
163void
164_getHexArray (src)
165 SV* src
166 PREINIT:
167 char *s, *e;
168 STRLEN byte;
169 UV value;
170 bool overflowed = FALSE;
171 const char *hexdigit;
172 PPCODE:
173 s = SvPV(src,byte);
174 for (e = s + byte; s < e;) {
175 hexdigit = strchr((char *) PL_hexdigit, *s++);
176 if (! hexdigit)
177 continue;
178 value = (hexdigit - PL_hexdigit) & 0xF;
179 while (*s) {
180 hexdigit = strchr((char *) PL_hexdigit, *s++);
181 if (! hexdigit)
182 break;
183 if (overflowed)
184 continue;
185 if (value > max_div_16) {
186 overflowed = TRUE;
187 continue;
188 }
189 value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
190 }
191 XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
192 }
193
194
195SV*
196_isIllegal (sv)
197 SV* sv
198 PREINIT:
199 UV uv;
200 CODE:
201 if (!sv || !SvIOK(sv))
202 XSRETURN_YES;
203 uv = SvUVX(sv);
204 RETVAL = boolSV(
205 0x10FFFF < uv /* out of range */
206 );
207OUTPUT:
208 RETVAL
209
210
211SV*
212_isNonchar (sv)
213 SV* sv
214 PREINIT:
215 UV uv;
216 CODE:
217 /* should be called only if ! _isIllegal(sv). */
218 uv = SvUVX(sv);
219 RETVAL = boolSV(
220 ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */
221 || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
222 || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
223 );
224OUTPUT:
225 RETVAL
226
227
228void
229_decompHangul (code)
230 UV code
231 PREINIT:
232 UV sindex, lindex, vindex, tindex;
233 PPCODE:
234 /* code *must* be in Hangul syllable.
235 * Check it before you enter here. */
236 sindex = code - Hangul_SBase;
237 lindex = sindex / Hangul_NCount;
238 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
239 tindex = sindex % Hangul_TCount;
240
241 XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
242 XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
243 if (tindex)
244 XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
245
246
247SV*
248getHST (code, uca_vers = 0)
249 UV code;
250 IV uca_vers;
251 PREINIT:
252 char * hangtype;
253 STRLEN typelen;
254 CODE:
255 if (codeRange(Hangul_SIni, Hangul_SFin)) {
256 if ((code - Hangul_SBase) % Hangul_TCount) {
257 hangtype = "LVT"; typelen = 3;
258 } else {
259 hangtype = "LV"; typelen = 2;
260 }
261 } else if (uca_vers < 20) {
262 if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
263 hangtype = "L"; typelen = 1;
264 } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
265 hangtype = "V"; typelen = 1;
266 } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
267 hangtype = "T"; typelen = 1;
268 } else {
269 hangtype = ""; typelen = 0;
270 }
271 } else {
272 if (codeRange(Hangul_LIni, Hangul_LEnd) ||
273 codeRange(HangulL2Ini, HangulL2Fin)) {
274 hangtype = "L"; typelen = 1;
275 } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
276 codeRange(HangulV2Ini, HangulV2Fin)) {
277 hangtype = "V"; typelen = 1;
278 } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
279 codeRange(HangulT2Ini, HangulT2Fin)) {
280 hangtype = "T"; typelen = 1;
281 } else {
282 hangtype = ""; typelen = 0;
283 }
284 }
285
286 RETVAL = newSVpvn(hangtype, typelen);
287OUTPUT:
288 RETVAL
289
290
291void
292_derivCE_9 (code)
293 UV code
294 ALIAS:
295 _derivCE_14 = 1
296 _derivCE_18 = 2
297 _derivCE_20 = 3
298 _derivCE_22 = 4
299 PREINIT:
300 UV base, aaaa, bbbb;
301 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
302 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
303 bool basic_unified = 0;
304 PPCODE:
305 if (CJK_UidIni <= code) {
306 if (codeRange(0xFA0E, 0xFA29))
307 basic_unified = (bool)UnifiedCompat[code - 0xFA0E];
308 else
309 basic_unified = (ix >= 3 ? (code <= CJK_UidF52) :
310 ix == 2 ? (code <= CJK_UidF51) :
311 ix == 1 ? (code <= CJK_UidF41) :
312 (code <= CJK_UidFin));
313 }
314 base = (basic_unified)
315 ? 0xFB40 : /* CJK */
316 ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
317 ||
318 (codeRange(CJK_ExtBIni, CJK_ExtBFin))
319 ||
320 (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
321 ||
322 (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
323 ? 0xFB80 /* CJK ext. */
324 : 0xFBC0; /* others */
325 aaaa = base + (code >> 15);
326 bbbb = (code & 0x7FFF) | 0x8000;
327 a[1] = (U8)(aaaa >> 8);
328 a[2] = (U8)(aaaa & 0xFF);
329 b[1] = (U8)(bbbb >> 8);
330 b[2] = (U8)(bbbb & 0xFF);
331 a[7] = b[7] = (U8)(code >> 8);
332 a[8] = b[8] = (U8)(code & 0xFF);
333 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
334 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
335
336
337void
338_derivCE_8 (code)
339 UV code
340 PREINIT:
341 UV aaaa, bbbb;
342 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF";
343 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
344 PPCODE:
345 aaaa = 0xFF80 + (code >> 15);
346 bbbb = (code & 0x7FFF) | 0x8000;
347 a[1] = (U8)(aaaa >> 8);
348 a[2] = (U8)(aaaa & 0xFF);
349 b[1] = (U8)(bbbb >> 8);
350 b[2] = (U8)(bbbb & 0xFF);
351 a[7] = b[7] = (U8)(code >> 8);
352 a[8] = b[8] = (U8)(code & 0xFF);
353 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
354 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
355
356
357void
358_uideoCE_8 (code)
359 UV code
360 PREINIT:
361 U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
362 PPCODE:
363 uice[1] = uice[7] = (U8)(code >> 8);
364 uice[2] = uice[8] = (U8)(code & 0xFF);
365 XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
366
367
368SV*
369_isUIdeo (code, uca_vers)
370 UV code;
371 IV uca_vers;
372 bool basic_unified = 0;
373 CODE:
374 /* uca_vers = 0 for _uideoCE_8() */
375 if (CJK_UidIni <= code) {
376 if (codeRange(0xFA0E, 0xFA29))
377 basic_unified = (bool)UnifiedCompat[code - 0xFA0E];
378 else
379 basic_unified = (uca_vers >= 20 ? (code <= CJK_UidF52) :
380 uca_vers >= 18 ? (code <= CJK_UidF51) :
381 uca_vers >= 14 ? (code <= CJK_UidF41) :
382 (code <= CJK_UidFin));
383 }
384 RETVAL = boolSV(
385 (basic_unified)
386 ||
387 (codeRange(CJK_ExtAIni, CJK_ExtAFin))
388 ||
389 (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
390 ||
391 (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
392 ||
393 (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
394 );
395OUTPUT:
396 RETVAL
397
398
399SV*
400mk_SortKey (self, buf)
401 SV* self;
402 SV* buf;
403 PREINIT:
404 SV *dst, **svp;
405 STRLEN dlen, vlen;
406 U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
407 AV *bufAV;
408 HV *selfHV;
409 UV back_flag;
410 I32 i, buf_len;
411 IV lv, level, uca_vers;
412 bool upper_lower, kata_hira, v2i, last_is_var;
413 CODE:
414 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
415 selfHV = (HV*)SvRV(self);
416 else
417 croak("$self is not a HASHREF.");
418
f58b9ef1
CBW
419 if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
420 bufAV = (AV*)SvRV(buf);
421 else
422 croak("XSUB, not an ARRAYREF.");
423
424 buf_len = av_len(bufAV);
425
426 if (buf_len < 0) { /* empty: -1 */
427 dlen = 2 * (MaxLevel - 1);
428 dst = newSV(dlen);
429 (void)SvPOK_only(dst);
430 d = (U8*)SvPVX(dst);
431 while (dlen--)
432 *d++ = '\0';
19265284
CBW
433 } else {
434 svp = hv_fetch(selfHV, "level", 5, FALSE);
435 level = svp ? SvIV(*svp) : MaxLevel;
436
f58b9ef1
CBW
437 for (lv = 0; lv < level; lv++) {
438 New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
439 s[lv] = eachlevel[lv];
440 }
441
442 svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
443 upper_lower = svp ? SvTRUE(*svp) : FALSE;
444 svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
445 kata_hira = svp ? SvTRUE(*svp) : FALSE;
446 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
447 uca_vers = SvIV(*svp);
448 svp = hv_fetch(selfHV, "variable", 8, FALSE);
449 v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
450 ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
451 : FALSE;
452
453 last_is_var = FALSE;
454 for (i = 0; i <= buf_len; i++) {
455 svp = av_fetch(bufAV, i, FALSE);
456
457 if (svp && SvPOK(*svp))
458 v = (U8*)SvPV(*svp, vlen);
459 else
460 croak("not a vwt.");
461
462 if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
463 continue;
464
465 /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
466 if (v2i) {
467 if (*v)
468 last_is_var = TRUE;
469 else if (v[1] || v[2]) /* non zero primary weight */
470 last_is_var = FALSE;
471 else if (last_is_var) /* zero primary weight; skipped */
472 continue;
473 }
474
475 if (v[5] == 0) { /* tert wt < 256 */
476 if (upper_lower) {
477 if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
478 v[6] -= 6;
479 else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
480 v[6] += 6;
481 else if (v[6] == 0x1C) /* square upper */
482 v[6]++;
483 else if (v[6] == 0x1D) /* square lower */
484 v[6]--;
485 }
486 if (kata_hira) {
487 if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
488 v[6] -= 2;
489 else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
490 v[6] += 5;
491 }
492 }
493
494 for (lv = 0; lv < level; lv++) {
495 if (v[2 * lv + 1] || v[2 * lv + 2]) {
496 *s[lv]++ = v[2 * lv + 1];
497 *s[lv]++ = v[2 * lv + 2];
498 }
499 }
500 }
501
502 dlen = 2 * (MaxLevel - 1);
503 for (lv = 0; lv < level; lv++)
504 dlen += s[lv] - eachlevel[lv];
505
506 dst = newSV(dlen);
507 (void)SvPOK_only(dst);
508 d = (U8*)SvPVX(dst);
509
510 svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
511 back_flag = svp ? SvUV(*svp) : (UV)0;
512
513 for (lv = 0; lv < level; lv++) {
514 if (back_flag & (1 << (lv + 1))) {
515 p = s[lv];
516 e = eachlevel[lv];
517 for ( ; e < p; p -= 2) {
518 *d++ = p[-2];
519 *d++ = p[-1];
520 }
521 }
522 else {
523 p = eachlevel[lv];
524 e = s[lv];
525 while (p < e)
526 *d++ = *p++;
527 }
528 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
529 *d++ = '\0';
530 *d++ = '\0';
531 }
532 }
533
534 for (lv = level; lv < MaxLevel; lv++) {
535 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
536 *d++ = '\0';
537 *d++ = '\0';
538 }
539 }
540
541 for (lv = 0; lv < level; lv++) {
542 Safefree(eachlevel[lv]);
543 }
544 }
545 *d = '\0';
546 SvCUR_set(dst, d - (U8*)SvPVX(dst));
547 RETVAL = dst;
548OUTPUT:
549 RETVAL
550
551
552SV*
19265284
CBW
553varCE (self, vce)
554 SV* self;
555 SV* vce;
f58b9ef1 556 PREINIT:
19265284
CBW
557 SV *dst, *vbl, **svp;
558 HV *selfHV;
f58b9ef1
CBW
559 U8 *a, *v, *d;
560 STRLEN alen, vlen;
19265284
CBW
561 bool ig_l2;
562 UV totwt;
f58b9ef1 563 CODE:
19265284
CBW
564 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
565 selfHV = (HV*)SvRV(self);
566 else
567 croak("$self is not a HASHREF.");
568
569 svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
570 ig_l2 = svp ? SvTRUE(*svp) : FALSE;
571
572 svp = hv_fetch(selfHV, "variable", 8, FALSE);
573 vbl = svp ? *svp : &PL_sv_no;
f58b9ef1
CBW
574 a = (U8*)SvPV(vbl, alen);
575 v = (U8*)SvPV(vce, vlen);
576
577 dst = newSV(vlen);
578 d = (U8*)SvPVX(dst);
579 (void)SvPOK_only(dst);
580 Copy(v, d, vlen, U8);
581 SvCUR_set(dst, vlen);
582 d[vlen] = '\0';
583
19265284
CBW
584 /* primary weight == 0 && secondary weight != 0 */
585 if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
586 d[3] = d[4] = d[5] = d[6] = '\0';
587 }
588
f58b9ef1
CBW
589 /* variable: checked only the first char and the length,
590 trusting checkCollator() and %VariableOK in Perl ... */
591
592 if (vlen < VCE_Length /* ignore short VCE (unexpected) */
593 ||
19265284 594 *a == 'n') /* non-ignorable */
f58b9ef1
CBW
595 1;
596 else if (*v) {
597 if (*a == 's') { /* shifted or shift-trimmed */
598 d[7] = d[1]; /* wt level 1 to 4 */
599 d[8] = d[2];
19265284
CBW
600 } /* else blanked */
601
f58b9ef1
CBW
602 d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
603 }
604 else if (*a == 'b') /* blanked */
605 1;
606 else if (*a == 's') { /* shifted or shift-trimmed */
19265284
CBW
607 totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
608 if (alen == 7 && totwt != 0) { /* shifted */
f58b9ef1
CBW
609 d[7] = (U8)(Shift4Wt >> 8);
610 d[8] = (U8)(Shift4Wt & 0xFF);
611 }
19265284
CBW
612 else { /* shift-trimmed */
613 d[7] = d[8] = '\0';
f58b9ef1
CBW
614 }
615 }
616 else
617 croak("unknown variable value '%s'", a);
618 RETVAL = dst;
619OUTPUT:
620 RETVAL
621
622
623
624SV*
625visualizeSortKey (self, key)
626 SV * self
627 SV * key
628 PREINIT:
629 HV *selfHV;
630 SV **svp, *dst;
631 U8 *s, *e, *d;
632 STRLEN klen, dlen;
633 UV uv;
634 IV uca_vers;
635 static char *upperhex = "0123456789ABCDEF";
636 CODE:
637 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
638 selfHV = (HV*)SvRV(self);
639 else
640 croak("$self is not a HASHREF.");
641
642 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
643 if (!svp)
644 croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
645 uca_vers = SvIV(*svp);
646
647 s = (U8*)SvPV(key, klen);
648
649 /* slightly *longer* than the need, but I'm afraid of miscounting;
650 exactly: (klen / 2) * 5 + MaxLevel * 2 - 1 (excluding '\0')
651 = (klen / 2) * 5 - 1 # FFFF (16bit) and ' ' between 16bit units
652 + (MaxLevel - 1) * 2 # ' ' and '|' for level boundaries
653 + 2 # '[' and ']'
654 */
655 dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
656 dst = newSV(dlen);
657 (void)SvPOK_only(dst);
658 d = (U8*)SvPVX(dst);
659
660 *d++ = '[';
661 for (e = s + klen; s < e; s += 2) {
662 uv = (U16)(*s << 8 | s[1]);
663 if (uv) {
664 if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
665 *d++ = ' ';
666 *d++ = upperhex[ (s[0] >> 4) & 0xF ];
667 *d++ = upperhex[ s[0] & 0xF ];
668 *d++ = upperhex[ (s[1] >> 4) & 0xF ];
669 *d++ = upperhex[ s[1] & 0xF ];
670 }
671 else {
672 if ((9 <= uca_vers) && (d[-1] != '['))
673 *d++ = ' ';
674 *d++ = '|';
675 }
676 }
677 *d++ = ']';
678 *d = '\0';
679 SvCUR_set(dst, d - (U8*)SvPVX(dst));
680 RETVAL = dst;
681OUTPUT:
682 RETVAL
683
684
685
686void
687unpack_U (src)
688 SV* src
689 PREINIT:
690 STRLEN srclen, retlen;
691 U8 *s, *p, *e;
692 UV uv;
693 PPCODE:
694 s = (U8*)SvPV(src,srclen);
695 if (!SvUTF8(src)) {
696 SV* tmpsv = sv_mortalcopy(src);
697 if (!SvPOK(tmpsv))
698 (void)sv_pvn_force(tmpsv,&srclen);
699 sv_utf8_upgrade(tmpsv);
700 s = (U8*)SvPV(tmpsv,srclen);
701 }
702 e = s + srclen;
703
704 for (p = s; p < e; p += retlen) {
705 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
706 if (!retlen)
707 croak(ErrRetlenIsZero);
708 XPUSHs(sv_2mortal(newSVuv(uv)));
709 }
710