This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: White space only; no code changes
[perl5.git] / cpan / Unicode-Collate / Collate.xs
CommitLineData
cd7f10f7
CBW
1
2#define PERL_NO_GET_CONTEXT /* we want efficiency */
3
4/* I guese no private function needs pTHX_ and aTHX_ */
5
f58b9ef1
CBW
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
9
10/* This file is prepared by mkheader */
11#include "ucatbl.h"
12
13/* Perl 5.6.1 ? */
14#ifndef utf8n_to_uvuni
15#define utf8n_to_uvuni utf8_to_uv
16#endif /* utf8n_to_uvuni */
17
18/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
19#ifndef UTF8_ALLOW_BOM
20#define UTF8_ALLOW_BOM (0)
21#endif /* UTF8_ALLOW_BOM */
22
23#ifndef UTF8_ALLOW_SURROGATE
24#define UTF8_ALLOW_SURROGATE (0)
25#endif /* UTF8_ALLOW_SURROGATE */
26
27#ifndef UTF8_ALLOW_FE_FF
28#define UTF8_ALLOW_FE_FF (0)
29#endif /* UTF8_ALLOW_FE_FF */
30
31#ifndef UTF8_ALLOW_FFFF
32#define UTF8_ALLOW_FFFF (0)
33#endif /* UTF8_ALLOW_FFFF */
34
35#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
36
37/* if utf8n_to_uvuni() sets retlen to 0 (?) */
38#define ErrRetlenIsZero "panic (Unicode::Collate): zero-length character"
39
40/* At present, char > 0x10ffff are unaffected without complaint, right? */
41#define VALID_UTF_MAX (0x10ffff)
42#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
43
44static const UV max_div_16 = UV_MAX / 16;
45
46/* Supported Levels */
47#define MinLevel (1)
48#define MaxLevel (4)
49
50/* Shifted weight at 4th level */
51#define Shift4Wt (0xFFFF)
52
53#define VCE_Length (9)
54
55#define Hangul_SBase (0xAC00)
56#define Hangul_SIni (0xAC00)
57#define Hangul_SFin (0xD7A3)
58#define Hangul_NCount (588)
59#define Hangul_TCount (28)
60#define Hangul_LBase (0x1100)
61#define Hangul_LIni (0x1100)
62#define Hangul_LFin (0x1159)
63#define Hangul_LFill (0x115F)
64#define Hangul_LEnd (0x115F) /* Unicode 5.2 */
65#define Hangul_VBase (0x1161)
66#define Hangul_VIni (0x1160) /* from Vowel Filler */
67#define Hangul_VFin (0x11A2)
68#define Hangul_VEnd (0x11A7) /* Unicode 5.2 */
69#define Hangul_TBase (0x11A7) /* from "no-final" codepoint */
70#define Hangul_TIni (0x11A8)
71#define Hangul_TFin (0x11F9)
72#define Hangul_TEnd (0x11FF) /* Unicode 5.2 */
73#define HangulL2Ini (0xA960) /* Unicode 5.2 */
74#define HangulL2Fin (0xA97C) /* Unicode 5.2 */
75#define HangulV2Ini (0xD7B0) /* Unicode 5.2 */
76#define HangulV2Fin (0xD7C6) /* Unicode 5.2 */
77#define HangulT2Ini (0xD7CB) /* Unicode 5.2 */
78#define HangulT2Fin (0xD7FB) /* Unicode 5.2 */
79
80#define CJK_UidIni (0x4E00)
81#define CJK_UidFin (0x9FA5)
82#define CJK_UidF41 (0x9FBB)
83#define CJK_UidF51 (0x9FC3)
84#define CJK_UidF52 (0x9FCB)
cba8842c 85#define CJK_UidF61 (0x9FCC)
f58b9ef1
CBW
86#define CJK_ExtAIni (0x3400) /* Unicode 3.0 */
87#define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */
88#define CJK_ExtBIni (0x20000) /* Unicode 3.1 */
89#define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */
90#define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */
91#define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */
92#define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */
93#define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */
94
46267efc
CBW
95#define CJK_CompIni (0xFA0E)
96#define CJK_CompFin (0xFA29)
f58b9ef1
CBW
97static STDCHAR UnifiedCompat[] = {
98 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
99}; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
100
101#define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode))
102
103MODULE = Unicode::Collate PACKAGE = Unicode::Collate
104
105PROTOTYPES: DISABLE
106
107void
108_fetch_rest ()
109 PREINIT:
110 char ** rest;
111 PPCODE:
112 for (rest = UCA_rest; *rest; ++rest) {
113 XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
114 }
115
116
117void
118_fetch_simple (uv)
119 UV uv
120 PREINIT:
121 U8 ***plane, **row;
122 U8* result = NULL;
123 PPCODE:
124 if (!OVER_UTF_MAX(uv)){
125 plane = (U8***)UCA_simple[uv >> 16];
126 if (plane) {
127 row = plane[(uv >> 8) & 0xff];
128 result = row ? row[uv & 0xff] : NULL;
129 }
130 }
131 if (result) {
132 int i;
133 int num = (int)*result;
134 ++result;
135 for (i = 0; i < num; ++i) {
136 XPUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
137 result += VCE_Length;
138 }
139 } else {
140 XPUSHs(sv_2mortal(newSViv(0)));
141 }
142
143SV*
144_ignorable_simple (uv)
145 UV uv
146 ALIAS:
147 _exists_simple = 1
148 PREINIT:
149 U8 ***plane, **row;
150 int num = -1;
151 U8* result = NULL;
152 CODE:
153 if (!OVER_UTF_MAX(uv)){
154 plane = (U8***)UCA_simple[uv >> 16];
155 if (plane) {
156 row = plane[(uv >> 8) & 0xff];
157 result = row ? row[uv & 0xff] : NULL;
158 }
159 if (result)
160 num = (int)*result; /* assuming 0 <= num < 128 */
161 }
162
163 if (ix)
164 RETVAL = boolSV(num >0);
165 else
166 RETVAL = boolSV(num==0);
167 OUTPUT:
168 RETVAL
169
170
171void
172_getHexArray (src)
173 SV* src
174 PREINIT:
175 char *s, *e;
176 STRLEN byte;
177 UV value;
178 bool overflowed = FALSE;
179 const char *hexdigit;
180 PPCODE:
181 s = SvPV(src,byte);
182 for (e = s + byte; s < e;) {
183 hexdigit = strchr((char *) PL_hexdigit, *s++);
184 if (! hexdigit)
185 continue;
186 value = (hexdigit - PL_hexdigit) & 0xF;
187 while (*s) {
188 hexdigit = strchr((char *) PL_hexdigit, *s++);
189 if (! hexdigit)
190 break;
191 if (overflowed)
192 continue;
193 if (value > max_div_16) {
194 overflowed = TRUE;
195 continue;
196 }
197 value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
198 }
199 XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
200 }
201
202
203SV*
204_isIllegal (sv)
205 SV* sv
206 PREINIT:
207 UV uv;
208 CODE:
209 if (!sv || !SvIOK(sv))
210 XSRETURN_YES;
211 uv = SvUVX(sv);
212 RETVAL = boolSV(
213 0x10FFFF < uv /* out of range */
214 );
215OUTPUT:
216 RETVAL
217
218
219SV*
220_isNonchar (sv)
221 SV* sv
222 PREINIT:
223 UV uv;
224 CODE:
225 /* should be called only if ! _isIllegal(sv). */
226 uv = SvUVX(sv);
227 RETVAL = boolSV(
228 ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] (cf. utf8.c) */
229 || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */
230 || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */
231 );
232OUTPUT:
233 RETVAL
234
235
236void
237_decompHangul (code)
238 UV code
239 PREINIT:
240 UV sindex, lindex, vindex, tindex;
241 PPCODE:
242 /* code *must* be in Hangul syllable.
243 * Check it before you enter here. */
244 sindex = code - Hangul_SBase;
245 lindex = sindex / Hangul_NCount;
246 vindex = (sindex % Hangul_NCount) / Hangul_TCount;
247 tindex = sindex % Hangul_TCount;
248
249 XPUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
250 XPUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
251 if (tindex)
252 XPUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
253
254
255SV*
256getHST (code, uca_vers = 0)
257 UV code;
258 IV uca_vers;
259 PREINIT:
0cb4637e 260 const char * hangtype;
f58b9ef1
CBW
261 STRLEN typelen;
262 CODE:
263 if (codeRange(Hangul_SIni, Hangul_SFin)) {
264 if ((code - Hangul_SBase) % Hangul_TCount) {
265 hangtype = "LVT"; typelen = 3;
266 } else {
267 hangtype = "LV"; typelen = 2;
268 }
269 } else if (uca_vers < 20) {
270 if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
271 hangtype = "L"; typelen = 1;
272 } else if (codeRange(Hangul_VIni, Hangul_VFin)) {
273 hangtype = "V"; typelen = 1;
274 } else if (codeRange(Hangul_TIni, Hangul_TFin)) {
275 hangtype = "T"; typelen = 1;
276 } else {
277 hangtype = ""; typelen = 0;
278 }
279 } else {
280 if (codeRange(Hangul_LIni, Hangul_LEnd) ||
281 codeRange(HangulL2Ini, HangulL2Fin)) {
282 hangtype = "L"; typelen = 1;
283 } else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
284 codeRange(HangulV2Ini, HangulV2Fin)) {
285 hangtype = "V"; typelen = 1;
286 } else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
287 codeRange(HangulT2Ini, HangulT2Fin)) {
288 hangtype = "T"; typelen = 1;
289 } else {
290 hangtype = ""; typelen = 0;
291 }
292 }
293
294 RETVAL = newSVpvn(hangtype, typelen);
295OUTPUT:
296 RETVAL
297
298
299void
300_derivCE_9 (code)
301 UV code
302 ALIAS:
303 _derivCE_14 = 1
304 _derivCE_18 = 2
305 _derivCE_20 = 3
306 _derivCE_22 = 4
cba8842c 307 _derivCE_24 = 5
f58b9ef1
CBW
308 PREINIT:
309 UV base, aaaa, bbbb;
310 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
311 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
312 bool basic_unified = 0;
313 PPCODE:
314 if (CJK_UidIni <= code) {
46267efc
CBW
315 if (codeRange(CJK_CompIni, CJK_CompFin))
316 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
f58b9ef1 317 else
cba8842c
A
318 basic_unified = (ix >= 5 ? (code <= CJK_UidF61) :
319 ix >= 3 ? (code <= CJK_UidF52) :
f58b9ef1
CBW
320 ix == 2 ? (code <= CJK_UidF51) :
321 ix == 1 ? (code <= CJK_UidF41) :
322 (code <= CJK_UidFin));
323 }
324 base = (basic_unified)
325 ? 0xFB40 : /* CJK */
326 ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
327 ||
328 (codeRange(CJK_ExtBIni, CJK_ExtBFin))
329 ||
330 (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
331 ||
332 (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)))
333 ? 0xFB80 /* CJK ext. */
334 : 0xFBC0; /* others */
335 aaaa = base + (code >> 15);
336 bbbb = (code & 0x7FFF) | 0x8000;
337 a[1] = (U8)(aaaa >> 8);
338 a[2] = (U8)(aaaa & 0xFF);
339 b[1] = (U8)(bbbb >> 8);
340 b[2] = (U8)(bbbb & 0xFF);
341 a[7] = b[7] = (U8)(code >> 8);
342 a[8] = b[8] = (U8)(code & 0xFF);
343 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
344 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
345
346
347void
348_derivCE_8 (code)
349 UV code
350 PREINIT:
351 UV aaaa, bbbb;
352 U8 a[VCE_Length + 1] = "\x00\xFF\xFF\x00\x02\x00\x01\xFF\xFF";
353 U8 b[VCE_Length + 1] = "\x00\xFF\xFF\x00\x00\x00\x00\xFF\xFF";
354 PPCODE:
355 aaaa = 0xFF80 + (code >> 15);
356 bbbb = (code & 0x7FFF) | 0x8000;
357 a[1] = (U8)(aaaa >> 8);
358 a[2] = (U8)(aaaa & 0xFF);
359 b[1] = (U8)(bbbb >> 8);
360 b[2] = (U8)(bbbb & 0xFF);
361 a[7] = b[7] = (U8)(code >> 8);
362 a[8] = b[8] = (U8)(code & 0xFF);
363 XPUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
364 XPUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
365
366
367void
368_uideoCE_8 (code)
369 UV code
370 PREINIT:
371 U8 uice[VCE_Length + 1] = "\x00\xFF\xFF\x00\x20\x00\x02\xFF\xFF";
372 PPCODE:
373 uice[1] = uice[7] = (U8)(code >> 8);
374 uice[2] = uice[8] = (U8)(code & 0xFF);
375 XPUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
376
377
378SV*
379_isUIdeo (code, uca_vers)
380 UV code;
381 IV uca_vers;
382 bool basic_unified = 0;
383 CODE:
384 /* uca_vers = 0 for _uideoCE_8() */
385 if (CJK_UidIni <= code) {
46267efc
CBW
386 if (codeRange(CJK_CompIni, CJK_CompFin))
387 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
f58b9ef1 388 else
cba8842c
A
389 basic_unified = (uca_vers >= 24 ? (code <= CJK_UidF61) :
390 uca_vers >= 20 ? (code <= CJK_UidF52) :
f58b9ef1
CBW
391 uca_vers >= 18 ? (code <= CJK_UidF51) :
392 uca_vers >= 14 ? (code <= CJK_UidF41) :
393 (code <= CJK_UidFin));
394 }
395 RETVAL = boolSV(
396 (basic_unified)
397 ||
398 (codeRange(CJK_ExtAIni, CJK_ExtAFin))
399 ||
400 (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
401 ||
402 (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
403 ||
404 (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
405 );
406OUTPUT:
407 RETVAL
408
409
410SV*
411mk_SortKey (self, buf)
412 SV* self;
413 SV* buf;
414 PREINIT:
415 SV *dst, **svp;
416 STRLEN dlen, vlen;
417 U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
418 AV *bufAV;
419 HV *selfHV;
420 UV back_flag;
421 I32 i, buf_len;
422 IV lv, level, uca_vers;
423 bool upper_lower, kata_hira, v2i, last_is_var;
424 CODE:
425 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
426 selfHV = (HV*)SvRV(self);
427 else
428 croak("$self is not a HASHREF.");
429
f58b9ef1
CBW
430 if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
431 bufAV = (AV*)SvRV(buf);
432 else
433 croak("XSUB, not an ARRAYREF.");
434
435 buf_len = av_len(bufAV);
436
437 if (buf_len < 0) { /* empty: -1 */
438 dlen = 2 * (MaxLevel - 1);
439 dst = newSV(dlen);
440 (void)SvPOK_only(dst);
441 d = (U8*)SvPVX(dst);
442 while (dlen--)
443 *d++ = '\0';
19265284
CBW
444 } else {
445 svp = hv_fetch(selfHV, "level", 5, FALSE);
446 level = svp ? SvIV(*svp) : MaxLevel;
447
f58b9ef1
CBW
448 for (lv = 0; lv < level; lv++) {
449 New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
450 s[lv] = eachlevel[lv];
451 }
452
453 svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
454 upper_lower = svp ? SvTRUE(*svp) : FALSE;
455 svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
456 kata_hira = svp ? SvTRUE(*svp) : FALSE;
457 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
458 uca_vers = SvIV(*svp);
459 svp = hv_fetch(selfHV, "variable", 8, FALSE);
460 v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
461 ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
462 : FALSE;
463
464 last_is_var = FALSE;
465 for (i = 0; i <= buf_len; i++) {
466 svp = av_fetch(bufAV, i, FALSE);
467
468 if (svp && SvPOK(*svp))
469 v = (U8*)SvPV(*svp, vlen);
470 else
471 croak("not a vwt.");
472
473 if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
474 continue;
475
476 /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
477 if (v2i) {
478 if (*v)
479 last_is_var = TRUE;
480 else if (v[1] || v[2]) /* non zero primary weight */
481 last_is_var = FALSE;
482 else if (last_is_var) /* zero primary weight; skipped */
483 continue;
484 }
485
486 if (v[5] == 0) { /* tert wt < 256 */
487 if (upper_lower) {
488 if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
489 v[6] -= 6;
490 else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
491 v[6] += 6;
492 else if (v[6] == 0x1C) /* square upper */
493 v[6]++;
494 else if (v[6] == 0x1D) /* square lower */
495 v[6]--;
496 }
497 if (kata_hira) {
498 if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
499 v[6] -= 2;
500 else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
501 v[6] += 5;
502 }
503 }
504
505 for (lv = 0; lv < level; lv++) {
506 if (v[2 * lv + 1] || v[2 * lv + 2]) {
507 *s[lv]++ = v[2 * lv + 1];
508 *s[lv]++ = v[2 * lv + 2];
509 }
510 }
511 }
512
513 dlen = 2 * (MaxLevel - 1);
514 for (lv = 0; lv < level; lv++)
515 dlen += s[lv] - eachlevel[lv];
516
517 dst = newSV(dlen);
518 (void)SvPOK_only(dst);
519 d = (U8*)SvPVX(dst);
520
521 svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
522 back_flag = svp ? SvUV(*svp) : (UV)0;
523
524 for (lv = 0; lv < level; lv++) {
525 if (back_flag & (1 << (lv + 1))) {
526 p = s[lv];
527 e = eachlevel[lv];
528 for ( ; e < p; p -= 2) {
529 *d++ = p[-2];
530 *d++ = p[-1];
531 }
532 }
533 else {
534 p = eachlevel[lv];
535 e = s[lv];
536 while (p < e)
537 *d++ = *p++;
538 }
539 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
540 *d++ = '\0';
541 *d++ = '\0';
542 }
543 }
544
545 for (lv = level; lv < MaxLevel; lv++) {
546 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
547 *d++ = '\0';
548 *d++ = '\0';
549 }
550 }
551
552 for (lv = 0; lv < level; lv++) {
553 Safefree(eachlevel[lv]);
554 }
555 }
556 *d = '\0';
557 SvCUR_set(dst, d - (U8*)SvPVX(dst));
558 RETVAL = dst;
559OUTPUT:
560 RETVAL
561
562
563SV*
19265284
CBW
564varCE (self, vce)
565 SV* self;
566 SV* vce;
f58b9ef1 567 PREINIT:
19265284
CBW
568 SV *dst, *vbl, **svp;
569 HV *selfHV;
f58b9ef1
CBW
570 U8 *a, *v, *d;
571 STRLEN alen, vlen;
19265284
CBW
572 bool ig_l2;
573 UV totwt;
f58b9ef1 574 CODE:
19265284
CBW
575 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
576 selfHV = (HV*)SvRV(self);
577 else
578 croak("$self is not a HASHREF.");
579
580 svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
581 ig_l2 = svp ? SvTRUE(*svp) : FALSE;
582
583 svp = hv_fetch(selfHV, "variable", 8, FALSE);
584 vbl = svp ? *svp : &PL_sv_no;
f58b9ef1
CBW
585 a = (U8*)SvPV(vbl, alen);
586 v = (U8*)SvPV(vce, vlen);
587
588 dst = newSV(vlen);
589 d = (U8*)SvPVX(dst);
590 (void)SvPOK_only(dst);
591 Copy(v, d, vlen, U8);
592 SvCUR_set(dst, vlen);
593 d[vlen] = '\0';
594
19265284
CBW
595 /* primary weight == 0 && secondary weight != 0 */
596 if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
597 d[3] = d[4] = d[5] = d[6] = '\0';
598 }
599
f58b9ef1
CBW
600 /* variable: checked only the first char and the length,
601 trusting checkCollator() and %VariableOK in Perl ... */
602
603 if (vlen < VCE_Length /* ignore short VCE (unexpected) */
604 ||
19265284 605 *a == 'n') /* non-ignorable */
f58b9ef1
CBW
606 1;
607 else if (*v) {
608 if (*a == 's') { /* shifted or shift-trimmed */
609 d[7] = d[1]; /* wt level 1 to 4 */
610 d[8] = d[2];
19265284
CBW
611 } /* else blanked */
612
f58b9ef1
CBW
613 d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
614 }
615 else if (*a == 'b') /* blanked */
616 1;
617 else if (*a == 's') { /* shifted or shift-trimmed */
19265284
CBW
618 totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
619 if (alen == 7 && totwt != 0) { /* shifted */
750da838
CBW
620 if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
621 d[7] = d[1]; /* wt level 1 to 4 */
622 d[8] = d[2];
623 } else {
624 d[7] = (U8)(Shift4Wt >> 8);
625 d[8] = (U8)(Shift4Wt & 0xFF);
626 }
627 } else { /* shift-trimmed */
19265284 628 d[7] = d[8] = '\0';
f58b9ef1
CBW
629 }
630 }
631 else
632 croak("unknown variable value '%s'", a);
633 RETVAL = dst;
634OUTPUT:
635 RETVAL
636
637
638
639SV*
640visualizeSortKey (self, key)
641 SV * self
642 SV * key
643 PREINIT:
644 HV *selfHV;
645 SV **svp, *dst;
646 U8 *s, *e, *d;
647 STRLEN klen, dlen;
648 UV uv;
750da838 649 IV uca_vers, sep = 0;
0cb4637e 650 static const char *upperhex = "0123456789ABCDEF";
f58b9ef1
CBW
651 CODE:
652 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
653 selfHV = (HV*)SvRV(self);
654 else
655 croak("$self is not a HASHREF.");
656
657 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
658 if (!svp)
659 croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
660 uca_vers = SvIV(*svp);
661
662 s = (U8*)SvPV(key, klen);
663
664 /* slightly *longer* than the need, but I'm afraid of miscounting;
750da838
CBW
665 = (klen / 2) * 5 - 1
666 # FFFF and ' ' for each 16bit units but ' ' is less by 1;
667 # ' ' and '|' for level boundaries including the identical level
668 + 2 # '[' and ']'
669 + 1 # '\0'
670 (a) if klen is odd (not expected), maybe more 5 bytes.
671 (b) there is not always the identical level.
f58b9ef1
CBW
672 */
673 dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
674 dst = newSV(dlen);
675 (void)SvPOK_only(dst);
676 d = (U8*)SvPVX(dst);
677
678 *d++ = '[';
679 for (e = s + klen; s < e; s += 2) {
680 uv = (U16)(*s << 8 | s[1]);
750da838 681 if (uv || sep >= MaxLevel) {
f58b9ef1
CBW
682 if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
683 *d++ = ' ';
684 *d++ = upperhex[ (s[0] >> 4) & 0xF ];
685 *d++ = upperhex[ s[0] & 0xF ];
686 *d++ = upperhex[ (s[1] >> 4) & 0xF ];
687 *d++ = upperhex[ s[1] & 0xF ];
750da838 688 } else {
f58b9ef1
CBW
689 if ((9 <= uca_vers) && (d[-1] != '['))
690 *d++ = ' ';
691 *d++ = '|';
750da838 692 ++sep;
f58b9ef1
CBW
693 }
694 }
695 *d++ = ']';
696 *d = '\0';
697 SvCUR_set(dst, d - (U8*)SvPVX(dst));
698 RETVAL = dst;
699OUTPUT:
700 RETVAL
701
702
703
704void
705unpack_U (src)
706 SV* src
707 PREINIT:
708 STRLEN srclen, retlen;
709 U8 *s, *p, *e;
710 UV uv;
711 PPCODE:
712 s = (U8*)SvPV(src,srclen);
713 if (!SvUTF8(src)) {
714 SV* tmpsv = sv_mortalcopy(src);
715 if (!SvPOK(tmpsv))
716 (void)sv_pvn_force(tmpsv,&srclen);
717 sv_utf8_upgrade(tmpsv);
718 s = (U8*)SvPV(tmpsv,srclen);
719 }
720 e = s + srclen;
721
722 for (p = s; p < e; p += retlen) {
723 uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
724 if (!retlen)
725 croak(ErrRetlenIsZero);
726 XPUSHs(sv_2mortal(newSVuv(uv)));
727 }
728