Commit | Line | Data |
---|---|---|
cd7f10f7 CBW |
1 | |
2 | #define PERL_NO_GET_CONTEXT /* we want efficiency */ | |
3 | ||
4 | /* I guese no private function needs pTHX_ and aTHX_ */ | |
5 | ||
f58b9ef1 CBW |
6 | #include "EXTERN.h" |
7 | #include "perl.h" | |
8 | #include "XSUB.h" | |
9 | ||
10 | /* This file is prepared by mkheader */ | |
11 | #include "ucatbl.h" | |
12 | ||
f58b9ef1 CBW |
13 | /* At present, char > 0x10ffff are unaffected without complaint, right? */ |
14 | #define VALID_UTF_MAX (0x10ffff) | |
15 | #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) | |
16 | ||
9d8690d8 | 17 | #define MAX_DIV_16 (UV_MAX / 16) |
f58b9ef1 CBW |
18 | |
19 | /* Supported Levels */ | |
20 | #define MinLevel (1) | |
21 | #define MaxLevel (4) | |
22 | ||
23 | /* Shifted weight at 4th level */ | |
24 | #define Shift4Wt (0xFFFF) | |
25 | ||
26 | #define VCE_Length (9) | |
27 | ||
28 | #define Hangul_SBase (0xAC00) | |
29 | #define Hangul_SIni (0xAC00) | |
30 | #define Hangul_SFin (0xD7A3) | |
31 | #define Hangul_NCount (588) | |
32 | #define Hangul_TCount (28) | |
33 | #define Hangul_LBase (0x1100) | |
34 | #define Hangul_LIni (0x1100) | |
35 | #define Hangul_LFin (0x1159) | |
36 | #define Hangul_LFill (0x115F) | |
37 | #define Hangul_LEnd (0x115F) /* Unicode 5.2 */ | |
38 | #define Hangul_VBase (0x1161) | |
39 | #define Hangul_VIni (0x1160) /* from Vowel Filler */ | |
40 | #define Hangul_VFin (0x11A2) | |
41 | #define Hangul_VEnd (0x11A7) /* Unicode 5.2 */ | |
42 | #define Hangul_TBase (0x11A7) /* from "no-final" codepoint */ | |
43 | #define Hangul_TIni (0x11A8) | |
44 | #define Hangul_TFin (0x11F9) | |
45 | #define Hangul_TEnd (0x11FF) /* Unicode 5.2 */ | |
46 | #define HangulL2Ini (0xA960) /* Unicode 5.2 */ | |
47 | #define HangulL2Fin (0xA97C) /* Unicode 5.2 */ | |
48 | #define HangulV2Ini (0xD7B0) /* Unicode 5.2 */ | |
49 | #define HangulV2Fin (0xD7C6) /* Unicode 5.2 */ | |
50 | #define HangulT2Ini (0xD7CB) /* Unicode 5.2 */ | |
51 | #define HangulT2Fin (0xD7FB) /* Unicode 5.2 */ | |
52 | ||
53 | #define CJK_UidIni (0x4E00) | |
54 | #define CJK_UidFin (0x9FA5) | |
eaecde9a CBW |
55 | #define CJK_UidF41 (0x9FBB) /* Unicode 4.1 */ |
56 | #define CJK_UidF51 (0x9FC3) /* Unicode 5.1 */ | |
57 | #define CJK_UidF52 (0x9FCB) /* Unicode 5.2 */ | |
58 | #define CJK_UidF61 (0x9FCC) /* Unicode 6.1 */ | |
59 | #define CJK_UidF80 (0x9FD5) /* Unicode 8.0 */ | |
60 | #define CJK_UidF100 (0x9FEA) /* Unicode 10.0 */ | |
61 | ||
f58b9ef1 CBW |
62 | #define CJK_ExtAIni (0x3400) /* Unicode 3.0 */ |
63 | #define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */ | |
64 | #define CJK_ExtBIni (0x20000) /* Unicode 3.1 */ | |
65 | #define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */ | |
66 | #define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */ | |
67 | #define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */ | |
68 | #define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */ | |
69 | #define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */ | |
fcd4e2f8 SH |
70 | #define CJK_ExtEIni (0x2B820) /* Unicode 8.0 */ |
71 | #define CJK_ExtEFin (0x2CEA1) /* Unicode 8.0 */ | |
eaecde9a CBW |
72 | #define CJK_ExtFIni (0x2CEB0) /* Unicode 10.0 */ |
73 | #define CJK_ExtFFin (0x2EBE0) /* Unicode 10.0 */ | |
f58b9ef1 | 74 | |
46267efc CBW |
75 | #define CJK_CompIni (0xFA0E) |
76 | #define CJK_CompFin (0xFA29) | |
9d8690d8 | 77 | static const STDCHAR UnifiedCompat[] = { |
f58b9ef1 CBW |
78 | 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1 |
79 | }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */ | |
80 | ||
6608d2d5 CBW |
81 | #define TangIdeoIni (0x17000) /* Unicode 9.0 */ |
82 | #define TangIdeoFin (0x187EC) /* Unicode 9.0 */ | |
83 | #define TangCompIni (0x18800) /* Unicode 9.0 */ | |
84 | #define TangCompFin (0x18AF2) /* Unicode 9.0 */ | |
eaecde9a CBW |
85 | #define NushuIni (0x1B170) /* Unicode 10.0 */ |
86 | #define NushuFin (0x1B2FB) /* Unicode 10.0 */ | |
6608d2d5 | 87 | |
f58b9ef1 CBW |
88 | #define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode)) |
89 | ||
90 | MODULE = Unicode::Collate PACKAGE = Unicode::Collate | |
91 | ||
92 | PROTOTYPES: DISABLE | |
93 | ||
94 | void | |
95 | _fetch_rest () | |
96 | PREINIT: | |
97 | char ** rest; | |
98 | PPCODE: | |
9d8690d8 | 99 | for (rest = (char **)UCA_rest; *rest; ++rest) { |
f58b9ef1 CBW |
100 | XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0))); |
101 | } | |
102 | ||
103 | ||
104 | void | |
105 | _fetch_simple (uv) | |
106 | UV uv | |
107 | PREINIT: | |
108 | U8 ***plane, **row; | |
109 | U8* result = NULL; | |
110 | PPCODE: | |
111 | if (!OVER_UTF_MAX(uv)){ | |
112 | plane = (U8***)UCA_simple[uv >> 16]; | |
113 | if (plane) { | |
114 | row = plane[(uv >> 8) & 0xff]; | |
115 | result = row ? row[uv & 0xff] : NULL; | |
116 | } | |
117 | } | |
118 | if (result) { | |
119 | int i; | |
120 | int num = (int)*result; | |
121 | ++result; | |
9d8690d8 | 122 | EXTEND(SP, num); |
f58b9ef1 | 123 | for (i = 0; i < num; ++i) { |
9d8690d8 | 124 | PUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length))); |
f58b9ef1 CBW |
125 | result += VCE_Length; |
126 | } | |
127 | } else { | |
9d8690d8 | 128 | PUSHs(sv_2mortal(newSViv(0))); |
f58b9ef1 CBW |
129 | } |
130 | ||
131 | SV* | |
132 | _ignorable_simple (uv) | |
133 | UV uv | |
134 | ALIAS: | |
135 | _exists_simple = 1 | |
136 | PREINIT: | |
137 | U8 ***plane, **row; | |
138 | int num = -1; | |
139 | U8* result = NULL; | |
140 | CODE: | |
141 | if (!OVER_UTF_MAX(uv)){ | |
142 | plane = (U8***)UCA_simple[uv >> 16]; | |
143 | if (plane) { | |
144 | row = plane[(uv >> 8) & 0xff]; | |
145 | result = row ? row[uv & 0xff] : NULL; | |
146 | } | |
147 | if (result) | |
148 | num = (int)*result; /* assuming 0 <= num < 128 */ | |
149 | } | |
150 | ||
151 | if (ix) | |
152 | RETVAL = boolSV(num >0); | |
153 | else | |
154 | RETVAL = boolSV(num==0); | |
155 | OUTPUT: | |
156 | RETVAL | |
157 | ||
158 | ||
159 | void | |
160 | _getHexArray (src) | |
161 | SV* src | |
162 | PREINIT: | |
163 | char *s, *e; | |
164 | STRLEN byte; | |
165 | UV value; | |
166 | bool overflowed = FALSE; | |
167 | const char *hexdigit; | |
168 | PPCODE: | |
169 | s = SvPV(src,byte); | |
170 | for (e = s + byte; s < e;) { | |
171 | hexdigit = strchr((char *) PL_hexdigit, *s++); | |
9d8690d8 | 172 | if (! hexdigit) |
f58b9ef1 CBW |
173 | continue; |
174 | value = (hexdigit - PL_hexdigit) & 0xF; | |
175 | while (*s) { | |
176 | hexdigit = strchr((char *) PL_hexdigit, *s++); | |
177 | if (! hexdigit) | |
178 | break; | |
179 | if (overflowed) | |
180 | continue; | |
9d8690d8 | 181 | if (value > MAX_DIV_16) { |
f58b9ef1 CBW |
182 | overflowed = TRUE; |
183 | continue; | |
184 | } | |
185 | value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF); | |
186 | } | |
187 | XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value))); | |
188 | } | |
189 | ||
190 | ||
191 | SV* | |
192 | _isIllegal (sv) | |
193 | SV* sv | |
194 | PREINIT: | |
195 | UV uv; | |
196 | CODE: | |
197 | if (!sv || !SvIOK(sv)) | |
198 | XSRETURN_YES; | |
199 | uv = SvUVX(sv); | |
200 | RETVAL = boolSV( | |
60f577e0 | 201 | 0x10FFFF < uv /* out of range */ |
4779e03e | 202 | || ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] */ |
f58b9ef1 CBW |
203 | || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */ |
204 | || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */ | |
205 | ); | |
206 | OUTPUT: | |
207 | RETVAL | |
208 | ||
209 | ||
210 | void | |
211 | _decompHangul (code) | |
212 | UV code | |
213 | PREINIT: | |
214 | UV sindex, lindex, vindex, tindex; | |
215 | PPCODE: | |
216 | /* code *must* be in Hangul syllable. | |
217 | * Check it before you enter here. */ | |
218 | sindex = code - Hangul_SBase; | |
219 | lindex = sindex / Hangul_NCount; | |
220 | vindex = (sindex % Hangul_NCount) / Hangul_TCount; | |
221 | tindex = sindex % Hangul_TCount; | |
222 | ||
9d8690d8 CBW |
223 | EXTEND(SP, tindex ? 3 : 2); |
224 | PUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase))); | |
225 | PUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase))); | |
f58b9ef1 | 226 | if (tindex) |
9d8690d8 | 227 | PUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase))); |
f58b9ef1 CBW |
228 | |
229 | ||
230 | SV* | |
231 | getHST (code, uca_vers = 0) | |
232 | UV code; | |
233 | IV uca_vers; | |
234 | PREINIT: | |
0cb4637e | 235 | const char * hangtype; |
f58b9ef1 CBW |
236 | STRLEN typelen; |
237 | CODE: | |
238 | if (codeRange(Hangul_SIni, Hangul_SFin)) { | |
239 | if ((code - Hangul_SBase) % Hangul_TCount) { | |
240 | hangtype = "LVT"; typelen = 3; | |
241 | } else { | |
242 | hangtype = "LV"; typelen = 2; | |
243 | } | |
244 | } else if (uca_vers < 20) { | |
245 | if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) { | |
246 | hangtype = "L"; typelen = 1; | |
247 | } else if (codeRange(Hangul_VIni, Hangul_VFin)) { | |
248 | hangtype = "V"; typelen = 1; | |
249 | } else if (codeRange(Hangul_TIni, Hangul_TFin)) { | |
250 | hangtype = "T"; typelen = 1; | |
251 | } else { | |
252 | hangtype = ""; typelen = 0; | |
253 | } | |
254 | } else { | |
255 | if (codeRange(Hangul_LIni, Hangul_LEnd) || | |
256 | codeRange(HangulL2Ini, HangulL2Fin)) { | |
257 | hangtype = "L"; typelen = 1; | |
258 | } else if (codeRange(Hangul_VIni, Hangul_VEnd) || | |
259 | codeRange(HangulV2Ini, HangulV2Fin)) { | |
260 | hangtype = "V"; typelen = 1; | |
261 | } else if (codeRange(Hangul_TIni, Hangul_TEnd) || | |
262 | codeRange(HangulT2Ini, HangulT2Fin)) { | |
263 | hangtype = "T"; typelen = 1; | |
264 | } else { | |
265 | hangtype = ""; typelen = 0; | |
266 | } | |
267 | } | |
268 | ||
269 | RETVAL = newSVpvn(hangtype, typelen); | |
270 | OUTPUT: | |
271 | RETVAL | |
272 | ||
273 | ||
274 | void | |
275 | _derivCE_9 (code) | |
276 | UV code | |
277 | ALIAS: | |
278 | _derivCE_14 = 1 | |
279 | _derivCE_18 = 2 | |
280 | _derivCE_20 = 3 | |
281 | _derivCE_22 = 4 | |
cba8842c | 282 | _derivCE_24 = 5 |
fcd4e2f8 | 283 | _derivCE_32 = 6 |
6608d2d5 | 284 | _derivCE_34 = 7 |
eaecde9a | 285 | _derivCE_36 = 8 |
f58b9ef1 CBW |
286 | PREINIT: |
287 | UV base, aaaa, bbbb; | |
126be117 JH |
288 | U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; |
289 | U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; | |
eaecde9a | 290 | bool basic_unified = 0, tangut = 0, nushu = 0; |
f58b9ef1 | 291 | PPCODE: |
6608d2d5 | 292 | if (codeRange(CJK_UidIni, CJK_CompFin)) { |
46267efc CBW |
293 | if (codeRange(CJK_CompIni, CJK_CompFin)) |
294 | basic_unified = (bool)UnifiedCompat[code - CJK_CompIni]; | |
f58b9ef1 | 295 | else |
eaecde9a CBW |
296 | basic_unified = (ix >= 8 ? (code <= CJK_UidF100) : |
297 | ix >= 6 ? (code <= CJK_UidF80) : | |
fcd4e2f8 | 298 | ix == 5 ? (code <= CJK_UidF61) : |
cba8842c | 299 | ix >= 3 ? (code <= CJK_UidF52) : |
f58b9ef1 CBW |
300 | ix == 2 ? (code <= CJK_UidF51) : |
301 | ix == 1 ? (code <= CJK_UidF41) : | |
302 | (code <= CJK_UidFin)); | |
eaecde9a CBW |
303 | } else { |
304 | if (ix >= 7) | |
305 | tangut = (codeRange(TangIdeoIni, TangIdeoFin) || | |
306 | codeRange(TangCompIni, TangCompFin)); | |
307 | if (ix >= 8) | |
308 | nushu = (codeRange(NushuIni, NushuFin)); | |
f58b9ef1 | 309 | } |
6608d2d5 CBW |
310 | base = tangut |
311 | ? 0xFB00 : | |
eaecde9a CBW |
312 | nushu |
313 | ? 0xFB01 : | |
6608d2d5 | 314 | basic_unified |
f58b9ef1 CBW |
315 | ? 0xFB40 : /* CJK */ |
316 | ((codeRange(CJK_ExtAIni, CJK_ExtAFin)) | |
317 | || | |
318 | (codeRange(CJK_ExtBIni, CJK_ExtBFin)) | |
319 | || | |
320 | (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) | |
321 | || | |
fcd4e2f8 SH |
322 | (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)) |
323 | || | |
eaecde9a CBW |
324 | (ix >= 6 && codeRange(CJK_ExtEIni, CJK_ExtEFin)) |
325 | || | |
326 | (ix >= 8 && codeRange(CJK_ExtFIni, CJK_ExtFFin))) | |
f58b9ef1 CBW |
327 | ? 0xFB80 /* CJK ext. */ |
328 | : 0xFBC0; /* others */ | |
eaecde9a CBW |
329 | aaaa = tangut || nushu ? base : base + (code >> 15); |
330 | bbbb = (tangut ? (code - TangIdeoIni) : | |
331 | nushu ? (code - NushuIni) : (code & 0x7FFF)) | 0x8000; | |
f58b9ef1 CBW |
332 | a[1] = (U8)(aaaa >> 8); |
333 | a[2] = (U8)(aaaa & 0xFF); | |
334 | b[1] = (U8)(bbbb >> 8); | |
335 | b[2] = (U8)(bbbb & 0xFF); | |
126be117 JH |
336 | a[4] = (U8)(0x20); /* second octet of level 2 */ |
337 | a[6] = (U8)(0x02); /* second octet of level 3 */ | |
f58b9ef1 CBW |
338 | a[7] = b[7] = (U8)(code >> 8); |
339 | a[8] = b[8] = (U8)(code & 0xFF); | |
9d8690d8 CBW |
340 | EXTEND(SP, 2); |
341 | PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); | |
342 | PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); | |
f58b9ef1 CBW |
343 | |
344 | ||
345 | void | |
346 | _derivCE_8 (code) | |
347 | UV code | |
348 | PREINIT: | |
349 | UV aaaa, bbbb; | |
126be117 JH |
350 | U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; |
351 | U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; | |
f58b9ef1 CBW |
352 | PPCODE: |
353 | aaaa = 0xFF80 + (code >> 15); | |
354 | bbbb = (code & 0x7FFF) | 0x8000; | |
355 | a[1] = (U8)(aaaa >> 8); | |
356 | a[2] = (U8)(aaaa & 0xFF); | |
357 | b[1] = (U8)(bbbb >> 8); | |
358 | b[2] = (U8)(bbbb & 0xFF); | |
126be117 JH |
359 | a[4] = (U8)(0x02); /* second octet of level 2 */ |
360 | a[6] = (U8)(0x01); /* second octet of level 3 */ | |
f58b9ef1 CBW |
361 | a[7] = b[7] = (U8)(code >> 8); |
362 | a[8] = b[8] = (U8)(code & 0xFF); | |
9d8690d8 CBW |
363 | EXTEND(SP, 2); |
364 | PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); | |
365 | PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); | |
f58b9ef1 CBW |
366 | |
367 | ||
368 | void | |
369 | _uideoCE_8 (code) | |
370 | UV code | |
371 | PREINIT: | |
126be117 | 372 | U8 uice[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; |
f58b9ef1 CBW |
373 | PPCODE: |
374 | uice[1] = uice[7] = (U8)(code >> 8); | |
375 | uice[2] = uice[8] = (U8)(code & 0xFF); | |
126be117 JH |
376 | uice[4] = (U8)(0x20); /* second octet of level 2 */ |
377 | uice[6] = (U8)(0x02); /* second octet of level 3 */ | |
9d8690d8 | 378 | PUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length))); |
f58b9ef1 CBW |
379 | |
380 | ||
381 | SV* | |
382 | _isUIdeo (code, uca_vers) | |
383 | UV code; | |
384 | IV uca_vers; | |
385 | bool basic_unified = 0; | |
386 | CODE: | |
387 | /* uca_vers = 0 for _uideoCE_8() */ | |
388 | if (CJK_UidIni <= code) { | |
46267efc CBW |
389 | if (codeRange(CJK_CompIni, CJK_CompFin)) |
390 | basic_unified = (bool)UnifiedCompat[code - CJK_CompIni]; | |
f58b9ef1 | 391 | else |
eaecde9a CBW |
392 | basic_unified = (uca_vers >= 36 ? (code <= CJK_UidF100) : |
393 | uca_vers >= 32 ? (code <= CJK_UidF80) : | |
fcd4e2f8 | 394 | uca_vers >= 24 ? (code <= CJK_UidF61) : |
cba8842c | 395 | uca_vers >= 20 ? (code <= CJK_UidF52) : |
f58b9ef1 CBW |
396 | uca_vers >= 18 ? (code <= CJK_UidF51) : |
397 | uca_vers >= 14 ? (code <= CJK_UidF41) : | |
398 | (code <= CJK_UidFin)); | |
399 | } | |
400 | RETVAL = boolSV( | |
401 | (basic_unified) | |
402 | || | |
403 | (codeRange(CJK_ExtAIni, CJK_ExtAFin)) | |
404 | || | |
405 | (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin)) | |
406 | || | |
407 | (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) | |
408 | || | |
409 | (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin)) | |
fcd4e2f8 SH |
410 | || |
411 | (uca_vers >= 32 && codeRange(CJK_ExtEIni, CJK_ExtEFin)) | |
eaecde9a CBW |
412 | || |
413 | (uca_vers >= 36 && codeRange(CJK_ExtFIni, CJK_ExtFFin)) | |
f58b9ef1 CBW |
414 | ); |
415 | OUTPUT: | |
416 | RETVAL | |
417 | ||
418 | ||
419 | SV* | |
420 | mk_SortKey (self, buf) | |
421 | SV* self; | |
422 | SV* buf; | |
423 | PREINIT: | |
424 | SV *dst, **svp; | |
425 | STRLEN dlen, vlen; | |
426 | U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel]; | |
427 | AV *bufAV; | |
428 | HV *selfHV; | |
429 | UV back_flag; | |
430 | I32 i, buf_len; | |
431 | IV lv, level, uca_vers; | |
432 | bool upper_lower, kata_hira, v2i, last_is_var; | |
433 | CODE: | |
434 | if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) | |
435 | selfHV = (HV*)SvRV(self); | |
436 | else | |
437 | croak("$self is not a HASHREF."); | |
438 | ||
f58b9ef1 CBW |
439 | if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV) |
440 | bufAV = (AV*)SvRV(buf); | |
441 | else | |
442 | croak("XSUB, not an ARRAYREF."); | |
443 | ||
444 | buf_len = av_len(bufAV); | |
445 | ||
446 | if (buf_len < 0) { /* empty: -1 */ | |
447 | dlen = 2 * (MaxLevel - 1); | |
448 | dst = newSV(dlen); | |
449 | (void)SvPOK_only(dst); | |
450 | d = (U8*)SvPVX(dst); | |
451 | while (dlen--) | |
452 | *d++ = '\0'; | |
19265284 CBW |
453 | } else { |
454 | svp = hv_fetch(selfHV, "level", 5, FALSE); | |
455 | level = svp ? SvIV(*svp) : MaxLevel; | |
456 | ||
f58b9ef1 CBW |
457 | for (lv = 0; lv < level; lv++) { |
458 | New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8); | |
459 | s[lv] = eachlevel[lv]; | |
460 | } | |
461 | ||
462 | svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE); | |
463 | upper_lower = svp ? SvTRUE(*svp) : FALSE; | |
464 | svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE); | |
465 | kata_hira = svp ? SvTRUE(*svp) : FALSE; | |
466 | svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); | |
467 | uca_vers = SvIV(*svp); | |
468 | svp = hv_fetch(selfHV, "variable", 8, FALSE); | |
469 | v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */ | |
470 | ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13)) | |
471 | : FALSE; | |
472 | ||
473 | last_is_var = FALSE; | |
474 | for (i = 0; i <= buf_len; i++) { | |
475 | svp = av_fetch(bufAV, i, FALSE); | |
476 | ||
477 | if (svp && SvPOK(*svp)) | |
478 | v = (U8*)SvPV(*svp, vlen); | |
479 | else | |
480 | croak("not a vwt."); | |
481 | ||
482 | if (vlen < VCE_Length) /* ignore short VCE (unexpected) */ | |
483 | continue; | |
484 | ||
485 | /* "Ignorable (L1, L2) after Variable" since track. v. 9 */ | |
486 | if (v2i) { | |
487 | if (*v) | |
488 | last_is_var = TRUE; | |
489 | else if (v[1] || v[2]) /* non zero primary weight */ | |
490 | last_is_var = FALSE; | |
491 | else if (last_is_var) /* zero primary weight; skipped */ | |
492 | continue; | |
493 | } | |
494 | ||
495 | if (v[5] == 0) { /* tert wt < 256 */ | |
496 | if (upper_lower) { | |
497 | if (0x8 <= v[6] && v[6] <= 0xC) /* lower */ | |
498 | v[6] -= 6; | |
499 | else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */ | |
500 | v[6] += 6; | |
501 | else if (v[6] == 0x1C) /* square upper */ | |
502 | v[6]++; | |
503 | else if (v[6] == 0x1D) /* square lower */ | |
504 | v[6]--; | |
505 | } | |
506 | if (kata_hira) { | |
507 | if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */ | |
508 | v[6] -= 2; | |
509 | else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */ | |
510 | v[6] += 5; | |
511 | } | |
512 | } | |
513 | ||
514 | for (lv = 0; lv < level; lv++) { | |
515 | if (v[2 * lv + 1] || v[2 * lv + 2]) { | |
516 | *s[lv]++ = v[2 * lv + 1]; | |
517 | *s[lv]++ = v[2 * lv + 2]; | |
518 | } | |
519 | } | |
520 | } | |
521 | ||
522 | dlen = 2 * (MaxLevel - 1); | |
523 | for (lv = 0; lv < level; lv++) | |
524 | dlen += s[lv] - eachlevel[lv]; | |
525 | ||
526 | dst = newSV(dlen); | |
527 | (void)SvPOK_only(dst); | |
528 | d = (U8*)SvPVX(dst); | |
529 | ||
530 | svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE); | |
531 | back_flag = svp ? SvUV(*svp) : (UV)0; | |
532 | ||
533 | for (lv = 0; lv < level; lv++) { | |
534 | if (back_flag & (1 << (lv + 1))) { | |
535 | p = s[lv]; | |
536 | e = eachlevel[lv]; | |
537 | for ( ; e < p; p -= 2) { | |
538 | *d++ = p[-2]; | |
539 | *d++ = p[-1]; | |
540 | } | |
541 | } | |
542 | else { | |
543 | p = eachlevel[lv]; | |
544 | e = s[lv]; | |
545 | while (p < e) | |
546 | *d++ = *p++; | |
547 | } | |
548 | if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ | |
549 | *d++ = '\0'; | |
550 | *d++ = '\0'; | |
551 | } | |
552 | } | |
553 | ||
554 | for (lv = level; lv < MaxLevel; lv++) { | |
555 | if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ | |
556 | *d++ = '\0'; | |
557 | *d++ = '\0'; | |
558 | } | |
559 | } | |
560 | ||
561 | for (lv = 0; lv < level; lv++) { | |
562 | Safefree(eachlevel[lv]); | |
563 | } | |
564 | } | |
565 | *d = '\0'; | |
566 | SvCUR_set(dst, d - (U8*)SvPVX(dst)); | |
567 | RETVAL = dst; | |
568 | OUTPUT: | |
569 | RETVAL | |
570 | ||
571 | ||
572 | SV* | |
19265284 CBW |
573 | varCE (self, vce) |
574 | SV* self; | |
575 | SV* vce; | |
f58b9ef1 | 576 | PREINIT: |
19265284 CBW |
577 | SV *dst, *vbl, **svp; |
578 | HV *selfHV; | |
f58b9ef1 CBW |
579 | U8 *a, *v, *d; |
580 | STRLEN alen, vlen; | |
19265284 | 581 | bool ig_l2; |
eaecde9a | 582 | IV uca_vers; |
19265284 | 583 | UV totwt; |
f58b9ef1 | 584 | CODE: |
19265284 CBW |
585 | if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) |
586 | selfHV = (HV*)SvRV(self); | |
587 | else | |
588 | croak("$self is not a HASHREF."); | |
589 | ||
590 | svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE); | |
591 | ig_l2 = svp ? SvTRUE(*svp) : FALSE; | |
592 | ||
593 | svp = hv_fetch(selfHV, "variable", 8, FALSE); | |
594 | vbl = svp ? *svp : &PL_sv_no; | |
f58b9ef1 CBW |
595 | a = (U8*)SvPV(vbl, alen); |
596 | v = (U8*)SvPV(vce, vlen); | |
597 | ||
598 | dst = newSV(vlen); | |
599 | d = (U8*)SvPVX(dst); | |
600 | (void)SvPOK_only(dst); | |
601 | Copy(v, d, vlen, U8); | |
602 | SvCUR_set(dst, vlen); | |
603 | d[vlen] = '\0'; | |
604 | ||
19265284 CBW |
605 | /* primary weight == 0 && secondary weight != 0 */ |
606 | if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) { | |
607 | d[3] = d[4] = d[5] = d[6] = '\0'; | |
608 | } | |
609 | ||
f58b9ef1 CBW |
610 | /* variable: checked only the first char and the length, |
611 | trusting checkCollator() and %VariableOK in Perl ... */ | |
612 | ||
f8187d97 SH |
613 | if (vlen >= VCE_Length && *a != 'n') { |
614 | if (*v) { | |
615 | if (*a == 's') { /* shifted or shift-trimmed */ | |
750da838 CBW |
616 | d[7] = d[1]; /* wt level 1 to 4 */ |
617 | d[8] = d[2]; | |
f8187d97 SH |
618 | } /* else blanked */ |
619 | d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0'; | |
620 | } else if (*a == 's') { /* shifted or shift-trimmed */ | |
621 | totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6]; | |
622 | if (alen == 7 && totwt != 0) { /* shifted */ | |
623 | if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */ | |
624 | d[7] = d[1]; /* wt level 1 to 4 */ | |
625 | d[8] = d[2]; | |
626 | } else { | |
eaecde9a CBW |
627 | svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); |
628 | if (!svp) | |
629 | croak("Panic: no $self->{UCA_Version} in varCE"); | |
630 | uca_vers = SvIV(*svp); | |
631 | ||
632 | /* completely ignorable or the second derived CE */ | |
633 | if (uca_vers >= 36 && d[3] + d[4] + d[5] + d[6] == 0) { | |
634 | d[7] = d[8] = '\0'; | |
635 | } else { | |
636 | d[7] = (U8)(Shift4Wt >> 8); | |
637 | d[8] = (U8)(Shift4Wt & 0xFF); | |
638 | } | |
f8187d97 SH |
639 | } |
640 | } else { /* shift-trimmed or completely ignorable */ | |
641 | d[7] = d[8] = '\0'; | |
750da838 | 642 | } |
f8187d97 SH |
643 | } /* else blanked */ |
644 | } /* else non-ignorable */ | |
f58b9ef1 CBW |
645 | RETVAL = dst; |
646 | OUTPUT: | |
647 | RETVAL | |
648 | ||
649 | ||
650 | ||
651 | SV* | |
652 | visualizeSortKey (self, key) | |
653 | SV * self | |
654 | SV * key | |
655 | PREINIT: | |
656 | HV *selfHV; | |
657 | SV **svp, *dst; | |
658 | U8 *s, *e, *d; | |
659 | STRLEN klen, dlen; | |
660 | UV uv; | |
750da838 | 661 | IV uca_vers, sep = 0; |
9d8690d8 | 662 | const char *upperhex = "0123456789ABCDEF"; |
f58b9ef1 CBW |
663 | CODE: |
664 | if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) | |
665 | selfHV = (HV*)SvRV(self); | |
666 | else | |
667 | croak("$self is not a HASHREF."); | |
668 | ||
669 | svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); | |
670 | if (!svp) | |
671 | croak("Panic: no $self->{UCA_Version} in visualizeSortKey"); | |
672 | uca_vers = SvIV(*svp); | |
673 | ||
674 | s = (U8*)SvPV(key, klen); | |
675 | ||
676 | /* slightly *longer* than the need, but I'm afraid of miscounting; | |
750da838 CBW |
677 | = (klen / 2) * 5 - 1 |
678 | # FFFF and ' ' for each 16bit units but ' ' is less by 1; | |
679 | # ' ' and '|' for level boundaries including the identical level | |
680 | + 2 # '[' and ']' | |
681 | + 1 # '\0' | |
682 | (a) if klen is odd (not expected), maybe more 5 bytes. | |
683 | (b) there is not always the identical level. | |
f58b9ef1 CBW |
684 | */ |
685 | dlen = (klen / 2) * 5 + MaxLevel * 2 + 2; | |
686 | dst = newSV(dlen); | |
687 | (void)SvPOK_only(dst); | |
688 | d = (U8*)SvPVX(dst); | |
689 | ||
690 | *d++ = '['; | |
691 | for (e = s + klen; s < e; s += 2) { | |
692 | uv = (U16)(*s << 8 | s[1]); | |
750da838 | 693 | if (uv || sep >= MaxLevel) { |
f58b9ef1 CBW |
694 | if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|'))) |
695 | *d++ = ' '; | |
696 | *d++ = upperhex[ (s[0] >> 4) & 0xF ]; | |
697 | *d++ = upperhex[ s[0] & 0xF ]; | |
698 | *d++ = upperhex[ (s[1] >> 4) & 0xF ]; | |
699 | *d++ = upperhex[ s[1] & 0xF ]; | |
750da838 | 700 | } else { |
f58b9ef1 CBW |
701 | if ((9 <= uca_vers) && (d[-1] != '[')) |
702 | *d++ = ' '; | |
703 | *d++ = '|'; | |
750da838 | 704 | ++sep; |
f58b9ef1 CBW |
705 | } |
706 | } | |
707 | *d++ = ']'; | |
708 | *d = '\0'; | |
709 | SvCUR_set(dst, d - (U8*)SvPVX(dst)); | |
710 | RETVAL = dst; | |
711 | OUTPUT: | |
712 | RETVAL |