This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Time::Piece from 1.21 to 1.22
[perl5.git] / cpan / Unicode-Normalize / Normalize.xs
CommitLineData
ac5ea531 1
a257baa9
CBW
2#define PERL_NO_GET_CONTEXT /* we want efficiency */
3
4/* private functions which need pTHX_ and aTHX_
5 pv_cat_decompHangul
6 sv_2pvunicode
7 pv_utf8_decompose
8 pv_utf8_reorder
9 pv_utf8_compose
10*/
11
ac5ea531
JH
12#include "EXTERN.h"
13#include "perl.h"
14#include "XSUB.h"
15
16/* These 5 files are prepared by mkheader */
17#include "unfcmb.h"
18#include "unfcan.h"
19#include "unfcpt.h"
20#include "unfcmp.h"
21#include "unfexc.h"
22
23/* Perl 5.6.1 ? */
24#ifndef uvuni_to_utf8
25#define uvuni_to_utf8 uv_to_utf8
6c941e0c 26#endif /* uvuni_to_utf8 */
ac5ea531
JH
27
28/* Perl 5.6.1 ? */
ab8fe378 29#ifndef utf8n_to_uvuni
788ba0f4 30#define utf8n_to_uvuni utf8_to_uv
6c941e0c 31#endif /* utf8n_to_uvuni */
ac5ea531 32
e524f5b2 33/* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
788ba0f4
CBW
34#ifndef UTF8_ALLOW_BOM
35#define UTF8_ALLOW_BOM (0)
36#endif /* UTF8_ALLOW_BOM */
37
38#ifndef UTF8_ALLOW_SURROGATE
39#define UTF8_ALLOW_SURROGATE (0)
40#endif /* UTF8_ALLOW_SURROGATE */
41
42#ifndef UTF8_ALLOW_FE_FF
43#define UTF8_ALLOW_FE_FF (0)
44#endif /* UTF8_ALLOW_FE_FF */
45
46#ifndef UTF8_ALLOW_FFFF
47#define UTF8_ALLOW_FFFF (0)
48#endif /* UTF8_ALLOW_FFFF */
49
50#define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
51
52/* check if the string buffer is enough before uvuni_to_utf8(). */
53/* dstart, d, and dlen should be defined outside before. */
54#define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \
55 if (dlen < curlen + (need)) { \
56 dlen += (need); \
57 Renew(dstart, dlen+1, U8); \
58 d = dstart + curlen; \
59 }
e524f5b2 60
788ba0f4 61/* if utf8n_to_uvuni() sets retlen to 0 (if broken?) */
fe067ad9 62#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
82e740b6
NC
63
64/* utf8_hop() hops back before start. Maybe broken UTF-8 */
65#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
66
ac5ea531
JH
67/* At present, char > 0x10ffff are unaffected without complaint, right? */
68#define VALID_UTF_MAX (0x10ffff)
69#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
70
fe067ad9
SP
71/* size of array for combining characters */
72/* enough as an initial value? */
73#define CC_SEQ_SIZE (10)
74#define CC_SEQ_STEP (5)
75
76/* HANGUL begin */
ac5ea531
JH
77#define Hangul_SBase 0xAC00
78#define Hangul_SFinal 0xD7A3
79#define Hangul_SCount 11172
80
81#define Hangul_NCount 588
82
83#define Hangul_LBase 0x1100
84#define Hangul_LFinal 0x1112
85#define Hangul_LCount 19
86
87#define Hangul_VBase 0x1161
88#define Hangul_VFinal 0x1175
89#define Hangul_VCount 21
90
91#define Hangul_TBase 0x11A7
92#define Hangul_TFinal 0x11C2
93#define Hangul_TCount 28
94
95#define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
2a204b45 96#define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0)
ac5ea531
JH
97#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
98#define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
99#define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
100#define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal))
fe067ad9 101/* HANGUL end */
ac5ea531
JH
102
103/* this is used for canonical ordering of combining characters (c.c.). */
104typedef struct {
105 U8 cc; /* combining class */
106 UV uv; /* codepoint */
107 STRLEN pos; /* position */
108} UNF_cc;
109
fe067ad9 110static int compare_cc(const void *a, const void *b)
ac5ea531
JH
111{
112 int ret_cc;
6c941e0c 113 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
8f118dcd
JH
114 if (ret_cc)
115 return ret_cc;
6c941e0c
JH
116
117 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
118 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
ac5ea531
JH
119}
120
fe067ad9 121static U8* dec_canonical(UV uv)
ac5ea531
JH
122{
123 U8 ***plane, **row;
8f118dcd
JH
124 if (OVER_UTF_MAX(uv))
125 return NULL;
ac5ea531 126 plane = (U8***)UNF_canon[uv >> 16];
8f118dcd
JH
127 if (! plane)
128 return NULL;
ac5ea531
JH
129 row = plane[(uv >> 8) & 0xff];
130 return row ? row[uv & 0xff] : NULL;
131}
132
fe067ad9 133static U8* dec_compat(UV uv)
ac5ea531
JH
134{
135 U8 ***plane, **row;
8f118dcd
JH
136 if (OVER_UTF_MAX(uv))
137 return NULL;
ac5ea531 138 plane = (U8***)UNF_compat[uv >> 16];
8f118dcd
JH
139 if (! plane)
140 return NULL;
ac5ea531
JH
141 row = plane[(uv >> 8) & 0xff];
142 return row ? row[uv & 0xff] : NULL;
143}
144
fe067ad9 145static UV composite_uv(UV uv, UV uv2)
ac5ea531
JH
146{
147 UNF_complist ***plane, **row, *cell, *i;
148
fe067ad9 149 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
8f118dcd 150 return 0;
ac5ea531 151
8f118dcd 152 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
fe067ad9
SP
153 UV lindex = uv - Hangul_LBase;
154 UV vindex = uv2 - Hangul_VBase;
155 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) *
156 Hangul_TCount);
ac5ea531 157 }
8f118dcd 158 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) {
fe067ad9
SP
159 UV tindex = uv2 - Hangul_TBase;
160 return(uv + tindex);
ac5ea531
JH
161 }
162 plane = UNF_compos[uv >> 16];
8f118dcd
JH
163 if (! plane)
164 return 0;
ac5ea531 165 row = plane[(uv >> 8) & 0xff];
8f118dcd
JH
166 if (! row)
167 return 0;
ac5ea531 168 cell = row[uv & 0xff];
8f118dcd
JH
169 if (! cell)
170 return 0;
171 for (i = cell; i->nextchar; i++) {
172 if (uv2 == i->nextchar)
173 return i->composite;
ac5ea531
JH
174 }
175 return 0;
176}
177
fe067ad9 178static U8 getCombinClass(UV uv)
ac5ea531
JH
179{
180 U8 **plane, *row;
8f118dcd
JH
181 if (OVER_UTF_MAX(uv))
182 return 0;
ac5ea531 183 plane = (U8**)UNF_combin[uv >> 16];
8f118dcd
JH
184 if (! plane)
185 return 0;
ac5ea531
JH
186 row = plane[(uv >> 8) & 0xff];
187 return row ? row[uv & 0xff] : 0;
188}
189
a257baa9 190static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
ac5ea531 191{
fe067ad9
SP
192 UV sindex = uv - Hangul_SBase;
193 UV lindex = sindex / Hangul_NCount;
194 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
195 UV tindex = sindex % Hangul_TCount;
ac5ea531 196
8f118dcd 197 if (! Hangul_IsS(uv))
fe067ad9 198 return d;
ac5ea531 199
fe067ad9
SP
200 d = uvuni_to_utf8(d, (lindex + Hangul_LBase));
201 d = uvuni_to_utf8(d, (vindex + Hangul_VBase));
8f118dcd 202 if (tindex)
fe067ad9
SP
203 d = uvuni_to_utf8(d, (tindex + Hangul_TBase));
204 return d;
ac5ea531
JH
205}
206
a257baa9 207static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
a092bcfd
RGS
208{
209 char *s;
210 STRLEN len;
39f4556f 211 s = SvPV(sv,len);
a092bcfd 212 if (!SvUTF8(sv)) {
39f4556f 213 SV* tmpsv = sv_2mortal(newSVpvn(s, len));
a092bcfd 214 if (!SvPOK(tmpsv))
39f4556f 215 s = SvPV_force(tmpsv,len);
a092bcfd 216 sv_utf8_upgrade(tmpsv);
39f4556f 217 s = SvPV(tmpsv,len);
a092bcfd 218 }
fe067ad9
SP
219 if (lp)
220 *lp = len;
a092bcfd
RGS
221 return s;
222}
223
fe067ad9 224static
a257baa9 225U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
fe067ad9
SP
226{
227 U8* p = s;
228 U8* e = s + slen;
229 U8* dstart = *dp;
230 U8* d = dstart;
231
232 while (p < e) {
233 STRLEN retlen;
234 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 235 if (!retlen)
fe067ad9
SP
236 croak(ErrRetlenIsZero, "decompose");
237 p += retlen;
238
239 if (Hangul_IsS(uv)) {
788ba0f4 240 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
a257baa9 241 d = pv_cat_decompHangul(aTHX_ d, uv);
fe067ad9 242 }
ac5ea531 243 else {
fe067ad9
SP
244 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
245
246 if (r) {
247 STRLEN len = (STRLEN)strlen((char *)r);
788ba0f4 248 Renew_d_if_not_enough_to(len)
fe067ad9
SP
249 while (len--)
250 *d++ = *r++;
251 }
252 else {
788ba0f4 253 Renew_d_if_not_enough_to(UTF8_MAXLEN)
fe067ad9
SP
254 d = uvuni_to_utf8(d, uv);
255 }
ac5ea531
JH
256 }
257 }
fe067ad9
SP
258 *dp = dstart;
259 return d;
260}
ac5ea531 261
fe067ad9 262static
a257baa9 263U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
fe067ad9
SP
264{
265 U8* p = s;
266 U8* e = s + slen;
788ba0f4
CBW
267 U8* dstart = *dp;
268 U8* d = dstart;
fe067ad9
SP
269
270 UNF_cc seq_ary[CC_SEQ_SIZE];
271 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
272 UNF_cc* seq_ext = NULL; /* extend if need */
273 STRLEN seq_max = CC_SEQ_SIZE;
274 STRLEN cc_pos = 0;
275
fe067ad9
SP
276 while (p < e) {
277 U8 curCC;
278 STRLEN retlen;
279 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
280 if (!retlen)
281 croak(ErrRetlenIsZero, "reorder");
282 p += retlen;
ac5ea531 283
fe067ad9 284 curCC = getCombinClass(uv);
ac5ea531 285
fe067ad9
SP
286 if (curCC != 0) {
287 if (seq_max < cc_pos + 1) { /* extend if need */
288 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
289 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
290 STRLEN i;
291 New(0, seq_ext, seq_max, UNF_cc);
292 for (i = 0; i < cc_pos; i++)
293 seq_ext[i] = seq_ary[i];
294 }
295 else {
296 Renew(seq_ext, seq_max, UNF_cc);
297 }
39f4556f 298 seq_ptr = seq_ext; /* use seq_ext from now */
fe067ad9 299 }
a092bcfd 300
fe067ad9
SP
301 seq_ptr[cc_pos].cc = curCC;
302 seq_ptr[cc_pos].uv = uv;
303 seq_ptr[cc_pos].pos = cc_pos;
304 ++cc_pos;
ac5ea531 305
fe067ad9
SP
306 if (p < e)
307 continue;
308 }
ac5ea531 309
788ba0f4 310 /* output */
fe067ad9
SP
311 if (cc_pos) {
312 STRLEN i;
313
314 if (cc_pos > 1) /* reordered if there are two c.c.'s */
315 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
316
317 for (i = 0; i < cc_pos; i++) {
788ba0f4 318 Renew_d_if_not_enough_to(UTF8_MAXLEN)
fe067ad9 319 d = uvuni_to_utf8(d, seq_ptr[i].uv);
fe067ad9
SP
320 }
321 cc_pos = 0;
322 }
2a204b45 323
e524f5b2 324 if (curCC == 0) {
788ba0f4 325 Renew_d_if_not_enough_to(UTF8_MAXLEN)
e524f5b2 326 d = uvuni_to_utf8(d, uv);
e524f5b2 327 }
fe067ad9
SP
328 }
329 if (seq_ext)
330 Safefree(seq_ext);
788ba0f4 331 *dp = dstart;
fe067ad9
SP
332 return d;
333}
ac5ea531 334
fe067ad9 335static
a257baa9 336U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
fe067ad9
SP
337{
338 U8* p = s;
339 U8* e = s + slen;
788ba0f4
CBW
340 U8* dstart = *dp;
341 U8* d = dstart;
fe067ad9 342
39f4556f 343 UV uvS = 0; /* code point of the starter */
fe067ad9
SP
344 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
345 U8 preCC = 0;
346
347 UV seq_ary[CC_SEQ_SIZE];
348 UV* seq_ptr = seq_ary; /* use array at the beginning */
349 UV* seq_ext = NULL; /* extend if need */
350 STRLEN seq_max = CC_SEQ_SIZE;
351 STRLEN cc_pos = 0;
352
fe067ad9
SP
353 while (p < e) {
354 U8 curCC;
355 STRLEN retlen;
356 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
357 if (!retlen)
358 croak(ErrRetlenIsZero, "compose");
359 p += retlen;
ac5ea531 360
fe067ad9 361 curCC = getCombinClass(uv);
82e740b6 362
fe067ad9 363 if (!valid_uvS) {
e524f5b2 364 if (curCC == 0) {
fe067ad9
SP
365 uvS = uv; /* the first Starter is found */
366 valid_uvS = TRUE;
367 if (p < e)
368 continue;
e524f5b2 369 }
fe067ad9 370 else {
788ba0f4 371 Renew_d_if_not_enough_to(UTF8_MAXLEN)
fe067ad9 372 d = uvuni_to_utf8(d, uv);
fe067ad9
SP
373 continue;
374 }
375 }
376 else {
377 bool composed;
378
379 /* blocked */
c33e8be1
Z
380 if ((iscontig && cc_pos) || /* discontiguous combination */
381 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
788ba0f4 382 (preCC > curCC)) /* blocked by higher CC: revised D2 */
fe067ad9
SP
383 composed = FALSE;
384
385 /* not blocked:
386 iscontig && cc_pos == 0 -- contiguous combination
387 curCC == 0 && preCC == 0 -- starter + starter
388 curCC != 0 && preCC < curCC -- lower CC */
389 else {
390 /* try composition */
391 UV uvComp = composite_uv(uvS, uv);
392
393 if (uvComp && !isExclusion(uvComp)) {
394 uvS = uvComp;
395 composed = TRUE;
82e740b6 396
fe067ad9
SP
397 /* preCC should not be changed to curCC */
398 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
399 if (p < e)
400 continue;
401 }
402 else
403 composed = FALSE;
404 }
405
406 if (!composed) {
407 preCC = curCC;
408 if (curCC != 0 || !(p < e)) {
409 if (seq_max < cc_pos + 1) { /* extend if need */
410 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
411 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
412 New(0, seq_ext, seq_max, UV);
413 Copy(seq_ary, seq_ext, cc_pos, UV);
414 }
415 else {
416 Renew(seq_ext, seq_max, UV);
417 }
2b8d773d 418 seq_ptr = seq_ext; /* use seq_ext from now */
fe067ad9
SP
419 }
420 seq_ptr[cc_pos] = uv;
421 ++cc_pos;
422 }
423 if (curCC != 0 && p < e)
424 continue;
ac5ea531 425 }
ac5ea531
JH
426 }
427
788ba0f4
CBW
428 /* output */
429 {
430 Renew_d_if_not_enough_to(UTF8_MAXLEN)
431 d = uvuni_to_utf8(d, uvS); /* starter (composed or not) */
432 }
fe067ad9 433
e524f5b2 434 if (cc_pos) {
fe067ad9 435 STRLEN i;
ac5ea531 436
fe067ad9 437 for (i = 0; i < cc_pos; i++) {
788ba0f4 438 Renew_d_if_not_enough_to(UTF8_MAXLEN)
fe067ad9 439 d = uvuni_to_utf8(d, seq_ptr[i]);
fe067ad9
SP
440 }
441 cc_pos = 0;
ac5ea531 442 }
fe067ad9
SP
443
444 uvS = uv;
ac5ea531 445 }
fe067ad9
SP
446 if (seq_ext)
447 Safefree(seq_ext);
788ba0f4 448 *dp = dstart;
fe067ad9
SP
449 return d;
450}
451
452MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
453
454SV*
455decompose(src, compat = &PL_sv_no)
456 SV * src
457 SV * compat
458 PROTOTYPE: $;$
459 PREINIT:
460 SV* dst;
461 U8 *s, *d, *dend;
462 STRLEN slen, dlen;
463 CODE:
a257baa9 464 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
fe067ad9
SP
465 dst = newSVpvn("", 0);
466 dlen = slen;
467 New(0, d, dlen+1, U8);
a257baa9 468 dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
9b374906 469 sv_setpvn(dst, (char *)d, dend - d);
fe067ad9
SP
470 SvUTF8_on(dst);
471 Safefree(d);
8f118dcd 472 RETVAL = dst;
ac5ea531
JH
473 OUTPUT:
474 RETVAL
475
788ba0f4 476
fe067ad9
SP
477SV*
478reorder(src)
479 SV * src
480 PROTOTYPE: $
481 PREINIT:
482 SV* dst;
483 U8 *s, *d, *dend;
484 STRLEN slen, dlen;
485 CODE:
a257baa9 486 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
fe067ad9 487 dst = newSVpvn("", 0);
788ba0f4
CBW
488 dlen = slen;
489 New(0, d, dlen+1, U8);
a257baa9 490 dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
788ba0f4 491 sv_setpvn(dst, (char *)d, dend - d);
fe067ad9 492 SvUTF8_on(dst);
788ba0f4 493 Safefree(d);
fe067ad9
SP
494 RETVAL = dst;
495 OUTPUT:
496 RETVAL
ac5ea531 497
788ba0f4 498
2a204b45 499SV*
a092bcfd
RGS
500compose(src)
501 SV * src
ac5ea531 502 PROTOTYPE: $
82e740b6
NC
503 ALIAS:
504 composeContiguous = 1
ac5ea531 505 PREINIT:
fe067ad9
SP
506 SV* dst;
507 U8 *s, *d, *dend;
508 STRLEN slen, dlen;
2a204b45 509 CODE:
a257baa9 510 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
fe067ad9 511 dst = newSVpvn("", 0);
788ba0f4
CBW
512 dlen = slen;
513 New(0, d, dlen+1, U8);
a257baa9 514 dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
788ba0f4 515 sv_setpvn(dst, (char *)d, dend - d);
ac5ea531 516 SvUTF8_on(dst);
788ba0f4 517 Safefree(d);
fe067ad9
SP
518 RETVAL = dst;
519 OUTPUT:
520 RETVAL
ac5ea531 521
788ba0f4 522
fe067ad9
SP
523SV*
524NFD(src)
525 SV * src
526 PROTOTYPE: $
527 ALIAS:
528 NFKD = 1
529 PREINIT:
530 SV *dst;
531 U8 *s, *t, *tend, *d, *dend;
532 STRLEN slen, tlen, dlen;
533 CODE:
a257baa9 534 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
788ba0f4
CBW
535
536 /* decompose */
fe067ad9
SP
537 tlen = slen;
538 New(0, t, tlen+1, U8);
a257baa9 539 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
fe067ad9 540 *tend = '\0';
788ba0f4 541 tlen = tend - t; /* no longer know real size of t */
fe067ad9
SP
542
543 /* reorder */
788ba0f4
CBW
544 dlen = tlen;
545 New(0, d, dlen+1, U8);
a257baa9 546 dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
fe067ad9 547 *dend = '\0';
788ba0f4 548 dlen = dend - d; /* no longer know real size of d */
ac5ea531 549
fe067ad9 550 /* return */
788ba0f4
CBW
551 dst = newSVpvn("", 0);
552 sv_setpvn(dst, (char *)d, dlen);
553 SvUTF8_on(dst);
554
fe067ad9 555 Safefree(t);
788ba0f4 556 Safefree(d);
fe067ad9
SP
557 RETVAL = dst;
558 OUTPUT:
559 RETVAL
82e740b6 560
788ba0f4 561
fe067ad9
SP
562SV*
563NFC(src)
564 SV * src
565 PROTOTYPE: $
566 ALIAS:
567 NFKC = 1
568 FCC = 2
569 PREINIT:
570 SV *dst;
571 U8 *s, *t, *tend, *u, *uend, *d, *dend;
572 STRLEN slen, tlen, ulen, dlen;
573 CODE:
a257baa9 574 s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
788ba0f4
CBW
575
576 /* decompose */
fe067ad9
SP
577 tlen = slen;
578 New(0, t, tlen+1, U8);
a257baa9 579 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
fe067ad9 580 *tend = '\0';
788ba0f4 581 tlen = tend - t; /* no longer know real size of t */
fe067ad9
SP
582
583 /* reorder */
788ba0f4 584 ulen = tlen;
fe067ad9 585 New(0, u, ulen+1, U8);
a257baa9 586 uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
fe067ad9 587 *uend = '\0';
788ba0f4 588 ulen = uend - u; /* no longer know real size of u */
fe067ad9
SP
589
590 /* compose */
788ba0f4
CBW
591 dlen = ulen;
592 New(0, d, dlen+1, U8);
a257baa9 593 dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
fe067ad9 594 *dend = '\0';
788ba0f4 595 dlen = dend - d; /* no longer know real size of d */
ac5ea531 596
fe067ad9 597 /* return */
788ba0f4
CBW
598 dst = newSVpvn("", 0);
599 sv_setpvn(dst, (char *)d, dlen);
600 SvUTF8_on(dst);
601
fe067ad9
SP
602 Safefree(t);
603 Safefree(u);
788ba0f4 604 Safefree(d);
2a204b45
JH
605 RETVAL = dst;
606 OUTPUT:
607 RETVAL
ac5ea531 608
788ba0f4 609
2b8d773d 610SV*
a092bcfd
RGS
611checkNFD(src)
612 SV * src
8f118dcd
JH
613 PROTOTYPE: $
614 ALIAS:
615 checkNFKD = 1
616 PREINIT:
8f118dcd
JH
617 STRLEN srclen, retlen;
618 U8 *s, *e, *p, curCC, preCC;
2b8d773d 619 bool result = TRUE;
82e740b6 620 CODE:
a257baa9 621 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
8f118dcd
JH
622 e = s + srclen;
623
624 preCC = 0;
625 for (p = s; p < e; p += retlen) {
fe067ad9 626 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 627 if (!retlen)
fe067ad9 628 croak(ErrRetlenIsZero, "checkNFD or -NFKD");
82e740b6 629
8f118dcd 630 curCC = getCombinClass(uv);
2b8d773d
RGS
631 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
632 result = FALSE;
633 break;
634 }
635 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
636 result = FALSE;
637 break;
638 }
8f118dcd
JH
639 preCC = curCC;
640 }
2b8d773d
RGS
641 RETVAL = boolSV(result);
642 OUTPUT:
643 RETVAL
8f118dcd
JH
644
645
2b8d773d 646SV*
a092bcfd
RGS
647checkNFC(src)
648 SV * src
8f118dcd
JH
649 PROTOTYPE: $
650 ALIAS:
651 checkNFKC = 1
652 PREINIT:
8f118dcd
JH
653 STRLEN srclen, retlen;
654 U8 *s, *e, *p, curCC, preCC;
2b8d773d
RGS
655 bool result = TRUE;
656 bool isMAYBE = FALSE;
82e740b6 657 CODE:
a257baa9 658 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
8f118dcd
JH
659 e = s + srclen;
660
661 preCC = 0;
8f118dcd 662 for (p = s; p < e; p += retlen) {
fe067ad9 663 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 664 if (!retlen)
fe067ad9 665 croak(ErrRetlenIsZero, "checkNFC or -NFKC");
82e740b6 666
8f118dcd 667 curCC = getCombinClass(uv);
2b8d773d
RGS
668 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
669 result = FALSE;
670 break;
671 }
8f118dcd
JH
672
673 /* get NFC/NFKC property */
674 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
675 ; /* YES */
2b8d773d
RGS
676 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
677 result = FALSE;
678 break;
679 }
8f118dcd
JH
680 else if (isComp2nd(uv))
681 isMAYBE = TRUE;
682 else if (ix) {
683 char *canon, *compat;
6c941e0c 684 /* NFKC_NO when having compatibility mapping. */
8f118dcd
JH
685 canon = (char *) dec_canonical(uv);
686 compat = (char *) dec_compat(uv);
2b8d773d
RGS
687 if (compat && !(canon && strEQ(canon, compat))) {
688 result = FALSE;
689 break;
690 }
8f118dcd
JH
691 } /* end of get NFC/NFKC property */
692
693 preCC = curCC;
694 }
2b8d773d 695 if (isMAYBE && result) /* NO precedes MAYBE */
8f118dcd 696 XSRETURN_UNDEF;
2b8d773d
RGS
697 RETVAL = boolSV(result);
698 OUTPUT:
699 RETVAL
8f118dcd
JH
700
701
2b8d773d 702SV*
a092bcfd
RGS
703checkFCD(src)
704 SV * src
82e740b6
NC
705 PROTOTYPE: $
706 ALIAS:
707 checkFCC = 1
708 PREINIT:
fe067ad9 709 STRLEN srclen, retlen;
82e740b6 710 U8 *s, *e, *p, curCC, preCC;
2b8d773d
RGS
711 bool result = TRUE;
712 bool isMAYBE = FALSE;
82e740b6 713 CODE:
a257baa9 714 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
82e740b6 715 e = s + srclen;
82e740b6 716 preCC = 0;
82e740b6 717 for (p = s; p < e; p += retlen) {
fe067ad9
SP
718 U8 *sCan;
719 UV uvLead;
39f4556f 720 STRLEN canlen = 0;
fe067ad9 721 UV uv = utf8n_to_uvuni(p, e - p, &retlen, AllowAnyUTF);
82e740b6 722 if (!retlen)
fe067ad9 723 croak(ErrRetlenIsZero, "checkFCD or -FCC");
82e740b6
NC
724
725 sCan = (U8*) dec_canonical(uv);
726
727 if (sCan) {
39f4556f 728 STRLEN canret;
82e740b6 729 canlen = (STRLEN)strlen((char *) sCan);
e524f5b2 730 uvLead = utf8n_to_uvuni(sCan, canlen, &canret, AllowAnyUTF);
fe067ad9
SP
731 if (!canret)
732 croak(ErrRetlenIsZero, "checkFCD or -FCC");
82e740b6
NC
733 }
734 else {
735 uvLead = uv;
736 }
737
738 curCC = getCombinClass(uvLead);
739
2b8d773d
RGS
740 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
741 result = FALSE;
742 break;
743 }
82e740b6
NC
744
745 if (ix) {
2b8d773d
RGS
746 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
747 result = FALSE;
748 break;
749 }
82e740b6
NC
750 else if (isComp2nd(uv))
751 isMAYBE = TRUE;
752 }
753
754 if (sCan) {
39f4556f 755 STRLEN canret;
fe067ad9
SP
756 UV uvTrail;
757 U8* eCan = sCan + canlen;
758 U8* pCan = utf8_hop(eCan, -1);
82e740b6
NC
759 if (pCan < sCan)
760 croak(ErrHopBeforeStart);
e524f5b2 761 uvTrail = utf8n_to_uvuni(pCan, eCan - pCan, &canret, AllowAnyUTF);
fe067ad9
SP
762 if (!canret)
763 croak(ErrRetlenIsZero, "checkFCD or -FCC");
82e740b6
NC
764 preCC = getCombinClass(uvTrail);
765 }
766 else {
767 preCC = curCC;
768 }
769 }
2b8d773d 770 if (isMAYBE && result) /* NO precedes MAYBE */
82e740b6 771 XSRETURN_UNDEF;
2b8d773d
RGS
772 RETVAL = boolSV(result);
773 OUTPUT:
774 RETVAL
82e740b6
NC
775
776
ac5ea531
JH
777U8
778getCombinClass(uv)
779 UV uv
8f118dcd 780 PROTOTYPE: $
ac5ea531
JH
781
782bool
2a204b45 783isExclusion(uv)
ac5ea531 784 UV uv
8f118dcd
JH
785 PROTOTYPE: $
786
787bool
788isSingleton(uv)
789 UV uv
790 PROTOTYPE: $
791
792bool
793isNonStDecomp(uv)
794 UV uv
795 PROTOTYPE: $
796
797bool
798isComp2nd(uv)
799 UV uv
800 PROTOTYPE: $
801 ALIAS:
802 isNFC_MAYBE = 1
803 isNFKC_MAYBE = 2
804
805
806
2b8d773d 807SV*
8f118dcd
JH
808isNFD_NO(uv)
809 UV uv
810 PROTOTYPE: $
811 ALIAS:
812 isNFKD_NO = 1
2b8d773d
RGS
813 PREINIT:
814 bool result = FALSE;
82e740b6 815 CODE:
8f118dcd 816 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv)))
2b8d773d
RGS
817 result = TRUE; /* NFD_NO or NFKD_NO */
818 RETVAL = boolSV(result);
819 OUTPUT:
820 RETVAL
8f118dcd
JH
821
822
2b8d773d 823SV*
8f118dcd
JH
824isComp_Ex(uv)
825 UV uv
826 PROTOTYPE: $
827 ALIAS:
828 isNFC_NO = 0
829 isNFKC_NO = 1
2b8d773d
RGS
830 PREINIT:
831 bool result = FALSE;
82e740b6 832 CODE:
8f118dcd 833 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
2b8d773d 834 result = TRUE; /* NFC_NO or NFKC_NO */
8f118dcd
JH
835 else if (ix) {
836 char *canon, *compat;
837 canon = (char *) dec_canonical(uv);
838 compat = (char *) dec_compat(uv);
839 if (compat && (!canon || strNE(canon, compat)))
2b8d773d 840 result = TRUE; /* NFC_NO or NFKC_NO */
8f118dcd 841 }
2b8d773d
RGS
842 RETVAL = boolSV(result);
843 OUTPUT:
844 RETVAL
ac5ea531 845
2a204b45 846SV*
ac5ea531
JH
847getComposite(uv, uv2)
848 UV uv
849 UV uv2
2a204b45
JH
850 PROTOTYPE: $$
851 PREINIT:
bcdb689b 852 UV composite;
2a204b45 853 CODE:
bcdb689b
JH
854 composite = composite_uv(uv, uv2);
855 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef;
2a204b45
JH
856 OUTPUT:
857 RETVAL
ac5ea531 858
8f118dcd
JH
859
860
ac5ea531
JH
861SV*
862getCanon(uv)
863 UV uv
864 PROTOTYPE: $
865 ALIAS:
866 getCompat = 1
ac5ea531 867 CODE:
8f118dcd 868 if (Hangul_IsS(uv)) {
fe067ad9
SP
869 U8 tmp[3 * UTF8_MAXLEN + 1];
870 U8 *t = tmp;
a257baa9 871 U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
fe067ad9 872 RETVAL = newSVpvn((char *)t, e - t);
ac5ea531 873 } else {
fe067ad9 874 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
8f118dcd
JH
875 if (!rstr)
876 XSRETURN_UNDEF;
ac5ea531
JH
877 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
878 }
879 SvUTF8_on(RETVAL);
880 OUTPUT:
881 RETVAL
882
82e740b6
NC
883
884void
a092bcfd
RGS
885splitOnLastStarter(src)
886 SV * src
82e740b6 887 PREINIT:
a092bcfd 888 SV *svp;
fe067ad9 889 STRLEN srclen;
82e740b6
NC
890 U8 *s, *e, *p;
891 PPCODE:
a257baa9 892 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
82e740b6 893 e = s + srclen;
fe067ad9
SP
894 p = e;
895 while (s < p) {
896 UV uv;
82e740b6
NC
897 p = utf8_hop(p, -1);
898 if (p < s)
899 croak(ErrHopBeforeStart);
fe067ad9 900 uv = utf8n_to_uvuni(p, e - p, NULL, AllowAnyUTF);
82e740b6
NC
901 if (getCombinClass(uv) == 0) /* Last Starter found */
902 break;
903 }
904
905 svp = sv_2mortal(newSVpvn((char*)s, p - s));
906 SvUTF8_on(svp);
907 XPUSHs(svp);
908
909 svp = sv_2mortal(newSVpvn((char*)p, e - p));
910 SvUTF8_on(svp);
911 XPUSHs(svp);
912