# This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
# Treat nmake the same as dmake (expand 6544e9b1a)
# [perl5.git] / cpan / Unicode-Normalize / Normalize.pm
# CommitLineData
package Unicode::Normalize;

# Compile-time sanity check: the rest of this module converts between code
# points and characters with pack('U')/unpack('U'), so refuse to load if
# either direction does not round-trip for a simple ASCII letter.
BEGIN {
    unless ('A' eq pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
    unless (0x41 == unpack('U', 'A')) {
        die "Unicode::Normalize cannot get Unicode code point\n";
    }
}

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

our $VERSION = '1.18';
our $PACKAGE = __PACKAGE__;    # used as a prefix in error messages

# Exported by default: the four normalization-form functions.
our @EXPORT = qw( NFC NFD NFKC NFKD );

# Exported on request only.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
    normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
);

# Named groups usable as "use Unicode::Normalize qw(:tag)".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

##
## utilities for tests
##

## string pack_U(list of code points)
## Builds a string from the given code points using the 'U*' template.
sub pack_U {
    return pack('U*', @_);
}

## list unpack_U(string)
## Returns the code points of the string. The result of pack('U*') with no
## arguments (an empty string) is appended before unpacking; presumably this
## forces the argument to be handled with character semantics.
sub unpack_U {
    return unpack('U*', shift(@_).pack('U*'));
}

require Exporter;

our @ISA = qw(Exporter);
use File::Spec;

# Lookup tables, keyed by code point (as a number). They are declared here
# and populated by the top-level loops that follow.
our %Combin;	# $codepoint => $number    : combination class
our %Canon;	# $codepoint => \@codepoints : canonical decomp.
our %Compat;	# $codepoint => \@codepoints : compat. decomp.
our %Compos;	# $1st,$2nd  => $codepoint : composite
our %Exclus;	# $codepoint => 1          : composition exclusions
our %Single;	# $codepoint => 1          : singletons
our %NonStD;	# $codepoint => 1          : non-starter decompositions
our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.

# from core Unicode database
# Each file is evaluated with do(); its return value is kept as a text
# table. The "unicode/" path is tried as a fallback for the "unicore/" one.
our $Combin = do "unicore/CombiningClass.pl"
    || do "unicode/CombiningClass.pl"
    || croak "$PACKAGE: CombiningClass.pl not found";
our $Decomp = do "unicore/Decomposition.pl"
    || do "unicode/Decomposition.pl"
    || croak "$PACKAGE: Decomposition.pl not found";

# CompositionExclusions.txt since Unicode 3.2.0
# (code points written as hexadecimal strings)
our @CompEx = qw(
    0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
    0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
    0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
    FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
    FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
    FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
    1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
);

# definition of Hangul constants
use constant SBase  => 0xAC00;
use constant SFinal => 0xD7A3; # SBase -1 + SCount
use constant SCount =>  11172; # LCount * NCount
use constant NCount =>    588; # VCount * TCount
use constant LBase  => 0x1100;
use constant LFinal => 0x1112;
use constant LCount =>     19;
use constant VBase  => 0x1161;
use constant VFinal => 0x1175;
use constant VCount =>     21;
use constant TBase  => 0x11A7;
use constant TFinal => 0x11C2;
use constant TCount =>     28;

## decomposeHangul(code point of a Hangul syllable)
## Computes the jamo sequence arithmetically from the syllable's offset
## from SBase: a leading jamo, a vowel jamo, and a trailing jamo only when
## the trailing index is non-zero.
## Returns the list of code points in list context, or the corresponding
## string (via pack_U) in scalar context. The argument is not range-checked
## here; callers test SBase..SFinal before calling.
sub decomposeHangul {
    my $sindex = $_[0] - SBase;
    my $lindex = int( $sindex / NCount);
    my $vindex = int(($sindex % NCount) / TCount);
    my $tindex =      $sindex % TCount;
    my @ret = (
        LBase + $lindex,
        VBase + $vindex,
        $tindex ? (TBase + $tindex) : (),
    );
    return wantarray ? @ret : pack_U(@ret);
}

########## getting full decomposition ##########

## converts string "hhhh hhhh hhhh" to a numeric list
## (hex digits separated by spaces)
## Matching is anchored with \G, so scanning stops at the first position
## that is not optional spaces followed by hexadecimal digits.
sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }

# Fill %Combin from the text in $Combin. Each non-empty line is split on
# tabs into (first code point, last code point or '', class); an empty
# second field means the line covers a single code point.
while ($Combin =~ /(.+)/g) {
    my @tab = split /\t/, $1;
    my $ini = hex $tab[0];
    if ($tab[1] eq '') {
        $Combin{$ini} = $tab[2];
    } else {
        $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
    }
}

# Fill %Compat (every mapping) and %Canon (mappings without a <tag>) from
# the text in $Decomp. The s/// both strips a leading "<...>" tag and, by
# its return value, records whether the mapping carried one.
while ($Decomp =~ /(.+)/g) {
    my @tab = split /\t/, $1;
    my $compat = $tab[2] =~ s/<[^>]+>//;
    my $dec = [ _getHexArray($tab[2]) ]; # decomposition
    my $ini = hex($tab[0]); # initial decomposable character
    my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
    # ($ini .. $end) is the range of decomposable characters.

    foreach my $u ($ini .. $end) {
        $Compat{$u} = $dec;
        $Canon{$u} = $dec if ! $compat;
    }
}

# Mark composition exclusions, but only for code points that actually have
# a canonical decomposition in the loaded database.
for my $s (@CompEx) {
    my $u = hex $s;
    next if !$Canon{$u}; # not assigned
    next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
    $Exclus{$u} = 1;
}

# Classify every canonical decomposition (still in its single-step form at
# this point) by length: pairs feed the composition table or are flagged
# as non-starter decompositions; one-element mappings are singletons.
foreach my $u (keys %Canon) {
    my $dec = $Canon{$u};

    if (@$dec == 2) {
        if ($Combin{ $dec->[0] }) {
            $NonStD{$u} = 1;
        } else {
            $Compos{ $dec->[0] }{ $dec->[1] } = $u;
            $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
        }
    } elsif (@$dec == 1) {
        $Single{$u} = 1;
    } else {
        my $h = sprintf '%04X', $u;
        croak("Weird Canonical Decomposition of U+$h");
    }
}

# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
    $Comp2nd{$j} = 1;
}

## list getCanonList(list of code points)
## Replaces each code point by its Hangul decomposition or its %Canon entry
## (or leaves it alone), and recurses until a pass changes nothing.
sub getCanonList {
    my @src = @_;
    my @dec = map {
        (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
            : $Canon{$_} ? @{ $Canon{$_} } : $_
    } @src;
    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
    # condition @src == @dec is not ok.
    # (that would compare only the element counts, not the contents)
}

## list getCompatList(list of code points)
## Same as getCanonList, but expands through the %Compat table.
sub getCompatList {
    my @src = @_;
    my @dec = map {
        (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
            : $Compat{$_} ? @{ $Compat{$_} } : $_
    } @src;
    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
    # condition @src == @dec is not ok.
    # (that would compare only the element counts, not the contents)
}

# exhaustive decomposition
# Replace every %Canon entry by its fully expanded form.
foreach my $key (keys %Canon) {
    $Canon{$key} = [ getCanonList($key) ];
}

# exhaustive decomposition
# Replace every %Compat entry by its fully expanded form.
foreach my $key (keys %Compat) {
    $Compat{$key} = [ getCompatList($key) ];
}

## getHangulComposite(code point, code point)
## Algorithmic Hangul composition of two code points:
##   leading jamo + vowel jamo               => LV syllable
##   LV syllable (no trailing) + trailing jamo => LVT syllable
## Returns the composite code point, or undef if the pair is neither.
sub getHangulComposite ($$) {
    if ((LBase <= $_[0] && $_[0] <= LFinal)
     && (VBase <= $_[1] && $_[1] <= VFinal)) {
        my $lindex = $_[0] - LBase;
        my $vindex = $_[1] - VBase;
        return (SBase + ($lindex * VCount + $vindex) * TCount);
    }
    # TBase itself is excluded (strict '<'): it stands for "no trailing jamo".
    if ((SBase <= $_[0] && $_[0] <= SFinal && (($_[0] - SBase ) % TCount) == 0)
     && (TBase  < $_[1] && $_[1] <= TFinal)) {
        return($_[0] + $_[1] - TBase);
    }
    return undef;
}

##########

## number getCombinClass(code point)
## Returns the combining class stored in %Combin, or 0 when there is none.
sub getCombinClass ($) {
    my $uv = 0 + shift;    # force a numeric hash key
    return $Combin{$uv} || 0;
}

## string getCanon(code point)
## Returns the canonical decomposition as a string: the %Canon entry if one
## exists, otherwise the Hangul decomposition for a Hangul syllable,
## otherwise undef.
sub getCanon ($) {
    my $uv = 0 + shift;    # force a numeric hash key
    return exists $Canon{$uv}
        ? pack_U(@{ $Canon{$uv} })
        : (SBase <= $uv && $uv <= SFinal)
            ? scalar decomposeHangul($uv)
            : undef;
}

## string getCompat(code point)
## Returns the compatibility decomposition as a string: the %Compat entry
## if one exists, otherwise the Hangul decomposition for a Hangul syllable,
## otherwise undef.
sub getCompat ($) {
    my $uv = 0 + shift;    # force a numeric hash key
    return exists $Compat{$uv}
        ? pack_U(@{ $Compat{$uv} })
        : (SBase <= $uv && $uv <= SFinal)
            ? scalar decomposeHangul($uv)
            : undef;
}

## getComposite(code point, code point)
## Returns the composite of the pair: the algorithmic Hangul composite if
## there is one, otherwise the %Compos entry. Composition exclusions are
## not filtered here. The result is false when the pair has no composite.
sub getComposite ($$) {
    my $uv1 = 0 + shift;
    my $uv2 = 0 + shift;
    my $hangul = getHangulComposite($uv1, $uv2);
    return $hangul if $hangul;
    # The first test avoids autovivifying $Compos{$uv1}.
    return $Compos{ $uv1 } && $Compos{ $uv1 }{ $uv2 };
}

## Property predicates. Each takes one code point, forces it to a number
## so that it matches the numeric hash keys, and returns a boolean.

## true if the code point is listed in %Exclus
sub isExclusion ($) {
    my $uv = 0 + shift;
    return exists $Exclus{$uv};
}

## true if the code point is listed in %Single
sub isSingleton ($) {
    my $uv = 0 + shift;
    return exists $Single{$uv};
}

## true if the code point is listed in %NonStD
sub isNonStDecomp($) {
    my $uv = 0 + shift;
    return exists $NonStD{$uv};
}

## true if the code point is listed in %Comp2nd
sub isComp2nd ($) {
    my $uv = 0 + shift;
    return exists $Comp2nd{$uv};
}

## same test as isComp2nd
sub isNFC_MAYBE ($) {
    my $uv = 0 + shift;
    return exists $Comp2nd{$uv};
}

## same test as isComp2nd
sub isNFKC_MAYBE($) {
    my $uv = 0 + shift;
    return exists $Comp2nd{$uv};
}

## true if the code point has a %Canon entry or is a Hangul syllable
sub isNFD_NO ($) {
    my $uv = 0 + shift;
    return exists $Canon {$uv} || (SBase <= $uv && $uv <= SFinal);
}

## true if the code point has a %Compat entry or is a Hangul syllable
sub isNFKD_NO ($) {
    my $uv = 0 + shift;
    return exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
}

## true if the code point is an exclusion, a singleton, or has a
## non-starter decomposition
sub isComp_Ex ($) {
    my $uv = 0 + shift;
    return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
}

## same test as isComp_Ex
sub isNFC_NO ($) {
    my $uv = 0 + shift;
    return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
}

## boolean isNFKC_NO(code point)
## True (1) for exclusions, singletons and non-starter decompositions.
## False ('') for Hangul syllables and for code points with no %Compat
## entry. Otherwise true when there is no %Canon entry, or when the
## canonical and compatibility decompositions differ.
sub isNFKC_NO ($) {
    my $uv = 0 + shift;
    return 1  if $Exclus{$uv} || $Single{$uv} || $NonStD{$uv};
    return '' if (SBase <= $uv && $uv <= SFinal) || !exists $Compat{$uv};
    return 1  if ! exists $Canon{$uv};
    # Compare the two code point lists by packing each into a byte string.
    return pack('N*', @{ $Canon{$uv} }) ne pack('N*', @{ $Compat{$uv} });
}

##
## string decompose(string, compat?)
##
## Replaces each character by its decomposition: the table entry if there
## is one (%Compat when the second argument is true, %Canon otherwise),
## else the Hangul decomposition for a Hangul syllable, else the character
## itself. No reordering is done here.
##
sub decompose ($;$)
{
    my $hash = $_[1] ? \%Compat : \%Canon;
    return pack_U map {
        $hash->{ $_ } ? @{ $hash->{ $_ } } :
            (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) : $_
    } unpack_U($_[0]);
}

##
## string reorder(string)
##
## Sorts every maximal run of characters with a non-zero combining class
## into ascending class order. Characters of equal class keep their
## original relative order.
##
sub reorder ($)
{
    my @src = unpack_U($_[0]);

    for (my $i=0; $i < @src;) {
        # skip characters with no (or zero) combining class
        $i++, next if ! $Combin{ $src[$i] };

        # [$ini, $i) is a run of characters with non-zero class
        my $ini = $i;
        $i++ while $i < @src && $Combin{ $src[$i] };

        # sort the indices; ties are broken by original position
        my @tmp = sort {
            $Combin{ $src[$a] } <=> $Combin{ $src[$b] } || $a <=> $b
        } $ini .. $i - 1;

        @src[ $ini .. $i - 1 ] = @src[ @tmp ];
    }
    return pack_U(@src);
}


##
## string compose(string)
##
## S : starter; NS : not starter;
##
## composable sequence begins at S.
## S + S or (S + S) + S may be composed.
## NS + NS must not be composed.
##
## Works in place on the list of code points: a composed pair is replaced
## by the composite at the starter's position, and the consumed character
## is set to undef and dropped when the result is packed.
##
sub compose ($)
{
    my @src = unpack_U($_[0]);

    for (my $s = 0; $s+1 < @src; $s++) {
        next unless defined $src[$s] && ! $Combin{ $src[$s] };
        # S only; removed or combining are skipped as a starter.

        my($c, $blocked, $uncomposed_cc);
        for (my $j = $s+1; $j < @src && !$blocked; $j++) {
            # a combining character sets $uncomposed_cc,
            # any other character sets $blocked
            ($Combin{ $src[$j] } ? $uncomposed_cc : $blocked) = 1;

            # S + C + S => S-S + C would be blocked.
            next if $blocked && $uncomposed_cc;

            # blocked by same CC (and higher CC: revised D2)
            next if defined $src[$j-1]   && $Combin{ $src[$j-1] }
                && $Combin{ $src[$j-1] } >= $Combin{ $src[$j] };

            $c = getComposite($src[$s], $src[$j]);

            # no composite or is exclusion
            next if !$c || $Exclus{$c};

            # replace by composite
            $src[$s] = $c; $src[$j] = undef;
            if ($blocked) { $blocked = 0 } else { -- $uncomposed_cc }
        }
    }
    return pack_U(grep defined, @src);
}


##
## string composeContiguous(string)
##
## Like compose(), but a starter is only combined with the characters
## immediately following it: scanning for that starter stops at the first
## following character that does not compose with it.
##
sub composeContiguous ($)
{
    my @src = unpack_U($_[0]);

    for (my $s = 0; $s+1 < @src; $s++) {
        next unless defined $src[$s] && ! $Combin{ $src[$s] };
        # S only; removed or combining are skipped as a starter.

        for (my $j = $s+1; $j < @src; $j++) {
            my $c = getComposite($src[$s], $src[$j]);

            # no composite or is exclusion
            last if !$c || $Exclus{$c};

            # replace by composite
            $src[$s] = $c; $src[$j] = undef;
        }
    }
    return pack_U(grep defined, @src);
}


##
## normalization forms
##

# true value passed to decompose() to select compatibility decomposition
use constant COMPAT => 1;

# Each form is a pipeline: decompose, reorder, then (for the composed
# forms) one of the composition functions.
sub NFD  ($) { reorder(decompose($_[0])) }
sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
sub NFC  ($) { compose(reorder(decompose($_[0]))) }
sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
sub FCC  ($) { composeContiguous(reorder(decompose($_[0]))) }

##
## quick check
##

## checkNFD(string)
## Returns '' as soon as a character is out of combining-class order, has
## a %Canon entry, or is a Hangul syllable; returns 1 otherwise.
sub checkNFD ($)
{
    my $preCC = 0;
    my $curCC;
    for my $uv (unpack_U($_[0])) {
        $curCC = $Combin{ $uv } || 0;
        # a non-zero class lower than the previous one: not ordered
        return '' if $preCC > $curCC && $curCC != 0;
        return '' if exists $Canon{$uv} || (SBase <= $uv && $uv <= SFinal);
        $preCC = $curCC;
    }
    return 1;
}

## checkNFKD(string)
## Returns '' as soon as a character is out of combining-class order, has
## a %Compat entry, or is a Hangul syllable; returns 1 otherwise.
sub checkNFKD ($)
{
    my $preCC = 0;
    my $curCC;
    for my $uv (unpack_U($_[0])) {
        $curCC = $Combin{ $uv } || 0;
        # a non-zero class lower than the previous one: not ordered
        return '' if $preCC > $curCC && $curCC != 0;
        return '' if exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
        $preCC = $curCC;
    }
    return 1;
}

## checkNFC(string)
## Returns '' (NO) on a combining-class ordering violation or on a
## character for which isNFC_NO is true; otherwise undef (MAYBE) if any
## character satisfied isNFC_MAYBE, else 1 (YES).
sub checkNFC ($)
{
    my $preCC = 0;
    my($curCC, $isMAYBE);
    for my $uv (unpack_U($_[0])) {
        $curCC = $Combin{ $uv } || 0;
        return '' if $preCC > $curCC && $curCC != 0;

        # MAYBE is tested first; NO is only consulted otherwise
        if (isNFC_MAYBE($uv)) {
            $isMAYBE = 1;
        } elsif (isNFC_NO($uv)) {
            return '';
        }
        $preCC = $curCC;
    }
    return $isMAYBE ? undef : 1;
}

## checkNFKC(string)
## Returns '' (NO) on a combining-class ordering violation or on a
## character for which isNFKC_NO is true; otherwise undef (MAYBE) if any
## character satisfied isNFKC_MAYBE, else 1 (YES).
sub checkNFKC ($)
{
    my $preCC = 0;
    my($curCC, $isMAYBE);
    for my $uv (unpack_U($_[0])) {
        $curCC = $Combin{ $uv } || 0;
        return '' if $preCC > $curCC && $curCC != 0;

        # MAYBE is tested first; NO is only consulted otherwise
        if (isNFKC_MAYBE($uv)) {
            $isMAYBE = 1;
        } elsif (isNFKC_NO($uv)) {
            return '';
        }
        $preCC = $curCC;
    }
    return $isMAYBE ? undef : 1;
}

## checkFCD(string)
## For each character, takes its canonical decomposition (or the character
## itself) and compares the class of the first element with the class of
## the last element of the previous character's decomposition.
## Returns '' on an ordering violation, 1 otherwise.
sub checkFCD ($)
{
    my $preCC = 0;
    my $curCC;
    for my $uv (unpack_U($_[0])) {
        # A Hangul syllable need not be decomposed since cc[any Jamo] == 0;
        my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);

        $curCC = $Combin{ $uvCan[0] } || 0;
        return '' if $curCC != 0 && $curCC < $preCC;
        $preCC = $Combin{ $uvCan[-1] } || 0;
    }
    return 1;
}

## checkFCC(string)
## Applies the same ordering test as checkFCD and, in addition, the
## isNFC_MAYBE / isNFC_NO tests used by checkNFC.
## Returns '' (NO), undef (MAYBE) or 1 (YES).
sub checkFCC ($)
{
    my $preCC = 0;
    my($curCC, $isMAYBE);
    for my $uv (unpack_U($_[0])) {
        # A Hangul syllable need not be decomposed since cc[any Jamo] == 0;
        my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);

        $curCC = $Combin{ $uvCan[0] } || 0;
        return '' if $curCC != 0 && $curCC < $preCC;

        if (isNFC_MAYBE($uv)) {
            $isMAYBE = 1;
        } elsif (isNFC_NO($uv)) {
            return '';
        }

        $preCC = $Combin{ $uvCan[-1] } || 0;
    }
    return $isMAYBE ? undef : 1;
}

##
## split on last starter
##
## (processed, unprocessed) = splitOnLastStarter(string)
## Moves characters from the end of the string into the second part until
## a character with combining class zero has been moved or the string is
## exhausted. An empty argument yields two empty strings.
##

sub splitOnLastStarter
{
    my $str = pack_U(unpack_U(shift));
    if ($str eq '') {
        return ('', '');
    }

    my $ch;
    my $unproc = "";
    # do-while: at least one character is always moved
    do {
        $ch = chop($str);
        $unproc = $ch.$unproc;
    } while (getCombinClass(unpack 'U', $ch) && $str ne "");
    return ($str, $unproc);
}

##
## normalize
##

## string FCD(string)
## Returns the string unchanged if checkFCD accepts it, otherwise its NFD.
sub FCD ($) {
    my $str = shift;
    return checkFCD($str) ? $str : NFD($str);
}

# Dispatch table: form name (long or short) => normalization function.
our %formNorm = (
    NFC  => \&NFC,	C  => \&NFC,
    NFD  => \&NFD,	D  => \&NFD,
    NFKC => \&NFKC,	KC => \&NFKC,
    NFKD => \&NFKD,	KD => \&NFKD,
    FCD  => \&FCD,	FCC => \&FCC,
);

## string normalize(form name, string)
## Looks the form name up in %formNorm and applies the function found.
## Croaks if the name is not a key of %formNorm.
sub normalize($$)
{
    my $form = shift;
    my $str = shift;
    if (exists $formNorm{$form}) {
        return $formNorm{$form}->($str);
    }
    croak($PACKAGE."::normalize: invalid form name: $form");
}

##
## partial
##

## string normalize_partial(form name, string)
## Normalizes the string, splits the result with splitOnLastStarter, and
## returns the first part. The second part is assigned to $_[1], i.e. the
## caller's second argument is overwritten.
## Croaks if the form name is not a key of %formNorm.
sub normalize_partial ($$) {
    if (exists $formNorm{$_[0]}) {
        my $n = normalize($_[0], $_[1]);
        my($p, $u) = splitOnLastStarter($n);
        $_[1] = $u;    # deliberate write through the @_ alias
        return $p;
    }
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
}

# Fixed-form wrappers; each passes $_[0] on, so the caller's argument is
# overwritten in the same way.
sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }

##
## check
##

# Dispatch table: form name (long or short) => quick-check function.
our %formCheck = (
    NFC  => \&checkNFC, 	C  => \&checkNFC,
    NFD  => \&checkNFD, 	D  => \&checkNFD,
    NFKC => \&checkNFKC,	KC => \&checkNFKC,
    NFKD => \&checkNFKD,	KD => \&checkNFKD,
    FCD  => \&checkFCD, 	FCC => \&checkFCC,
);

## check(form name, string)
## Looks the form name up in %formCheck and returns the result of the
## function found. Croaks if the name is not a key of %formCheck.
sub check($$)
{
    my $form = shift;
    my $str = shift;
    if (exists $formCheck{$form}) {
        return $formCheck{$form}->($str);
    }
    croak($PACKAGE."::check: invalid form name: $form");
}

# A module file must end by returning a true value.
1;
__END__
2a204b45
JH
614
615=head1 NAME
616
f027f502 617Unicode::Normalize - Unicode Normalization Forms
2a204b45
JH
618
619=head1 SYNOPSIS
620
a092bcfd
RGS
621(1) using function names exported by default:
622
2a204b45
JH
623 use Unicode::Normalize;
624
8f118dcd
JH
625 $NFD_string = NFD($string); # Normalization Form D
626 $NFC_string = NFC($string); # Normalization Form C
627 $NFKD_string = NFKD($string); # Normalization Form KD
628 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 629
a092bcfd 630(2) using function names exported on request:
2a204b45
JH
631
632 use Unicode::Normalize 'normalize';
633
8f118dcd
JH
634 $NFD_string = normalize('D', $string); # Normalization Form D
635 $NFC_string = normalize('C', $string); # Normalization Form C
636 $NFKD_string = normalize('KD', $string); # Normalization Form KD
637 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45
JH
638
639=head1 DESCRIPTION
640
00f2676f
JH
641Parameters:
642
fe067ad9 643C<$string> is used as a string under character semantics (see F<perlunicode>).
00f2676f 644
fe067ad9 645C<$code_point> should be an unsigned integer representing a Unicode code point.
00f2676f 646
Note: Do not use a floating-point number or a negative value as C<$code_point>.
00f2676f 648
d85850a7 649=head2 Normalization Forms
2a204b45
JH
650
651=over 4
652
8f118dcd 653=item C<$NFD_string = NFD($string)>
2a204b45 654
fe067ad9 655It returns the Normalization Form D (formed by canonical decomposition).
2a204b45 656
8f118dcd 657=item C<$NFC_string = NFC($string)>
2a204b45 658
fe067ad9 659It returns the Normalization Form C (formed by canonical decomposition
2a204b45
JH
660followed by canonical composition).
661
8f118dcd 662=item C<$NFKD_string = NFKD($string)>
2a204b45 663
fe067ad9 664It returns the Normalization Form KD (formed by compatibility decomposition).
2a204b45 665
8f118dcd 666=item C<$NFKC_string = NFKC($string)>
2a204b45 667
fe067ad9 668It returns the Normalization Form KC (formed by compatibility decomposition
2a204b45
JH
669followed by B<canonical> composition).
670
82e740b6
NC
671=item C<$FCD_string = FCD($string)>
672
673If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
fe067ad9 674it returns the string without modification; otherwise it returns an FCD string.
82e740b6
NC
675
Note: FCD is not always unique, so several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
678
679=item C<$FCC_string = FCC($string)>
680
fe067ad9 681It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
82e740b6 682
Note: FCC is unique, as are the four normalization forms (NF*).
82e740b6 684
8f118dcd 685=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 686
fe067ad9
SP
687It returns the normalization form of C<$form_name>.
688
2a204b45
JH
689As C<$form_name>, one of the following names must be given.
690
82e740b6
NC
691 'C' or 'NFC' for Normalization Form C (UAX #15)
692 'D' or 'NFD' for Normalization Form D (UAX #15)
693 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
694 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
695
696 'FCD' for "Fast C or D" Form (UTN #5)
697 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45
JH
698
699=back
700
8f118dcd
JH
701=head2 Decomposition and Composition
702
703=over 4
704
fe067ad9 705=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
8f118dcd 706
fe067ad9
SP
707It returns the concatenation of the decomposition of each character
708in the string.
8f118dcd 709
fe067ad9
SP
710If the second parameter (a boolean) is omitted or false,
711the decomposition is canonical decomposition;
712if the second parameter (a boolean) is true,
713the decomposition is compatibility decomposition.
8f118dcd 714
fe067ad9 715The string returned is not always in NFD/NFKD. Reordering may be required.
8f118dcd
JH
716
717 $NFD_string = reorder(decompose($string)); # eq. to NFD()
    $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
719
fe067ad9 720=item C<$reordered_string = reorder($string)>
8f118dcd 721
fe067ad9
SP
722It returns the result of reordering the combining characters
723according to Canonical Ordering Behavior.
8f118dcd 724
fe067ad9
SP
725For example, when you have a list of NFD/NFKD strings,
726you can get the concatenated NFD/NFKD string from them, by saying
8f118dcd
JH
727
728 $concat_NFD = reorder(join '', @NFD_strings);
729 $concat_NFKD = reorder(join '', @NFKD_strings);
730
fe067ad9 731=item C<$composed_string = compose($string)>
8f118dcd 732
fe067ad9
SP
733It returns the result of canonical composition
734without applying any decomposition.
8f118dcd 735
fe067ad9
SP
736For example, when you have a NFD/NFKD string,
737you can get its NFC/NFKC string, by saying
8f118dcd
JH
738
739 $NFC_string = compose($NFD_string);
740 $NFKC_string = compose($NFKD_string);
741
a96160d6
CBW
742=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
743
744It returns two strings: the first one, C<$processed>, is a part
745before the last starter, and the second one, C<$unprocessed> is
746another part after the first part. A starter is a character having
747a combining class of zero (see UAX #15).
748
749Note that C<$processed> may be empty (when C<$normalized> contains no
750starter or starts with the last starter), and then C<$unprocessed>
751should be equal to the entire C<$normalized>.
752
753When you have a C<$normalized> string and an C<$unnormalized> string
754following it, a simple concatenation is wrong:
755
756 $concat = $normalized . normalize($form, $unnormalized); # wrong!
757
Instead, do it like this:
759
760 ($processed, $unprocessed) = splitOnLastStarter($normalized);
761 $concat = $processed . normalize($form, $unprocessed.$unnormalized);
762
763C<splitOnLastStarter()> should be called with a pre-normalized parameter
764C<$normalized>, that is in the same form as C<$form> you want.
765
766If you have an array of C<@string> that should be concatenated and then
767normalized, you can do like this:
768
769 my $result = "";
770 my $unproc = "";
771 foreach my $str (@string) {
772 $unproc .= $str;
773 my $n = normalize($form, $unproc);
774 my($p, $u) = splitOnLastStarter($n);
775 $result .= $p;
776 $unproc = $u;
777 }
778 $result .= $unproc;
779 # instead of normalize($form, join('', @string))
780
781=item C<$processed = normalize_partial($form, $unprocessed)>
782
783A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
784Note that C<$unprocessed> will be modified as a side-effect.
785
786If you have an array of C<@string> that should be concatenated and then
787normalized, you can do like this:
788
789 my $result = "";
790 my $unproc = "";
791 foreach my $str (@string) {
792 $unproc .= $str;
793 $result .= normalize_partial($form, $unproc);
794 }
795 $result .= $unproc;
796 # instead of normalize($form, join('', @string))
797
798=item C<$processed = NFD_partial($unprocessed)>
799
800It does like C<normalize_partial('NFD', $unprocessed)>.
801Note that C<$unprocessed> will be modified as a side-effect.
802
803=item C<$processed = NFC_partial($unprocessed)>
804
805It does like C<normalize_partial('NFC', $unprocessed)>.
806Note that C<$unprocessed> will be modified as a side-effect.
807
808=item C<$processed = NFKD_partial($unprocessed)>
809
810It does like C<normalize_partial('NFKD', $unprocessed)>.
811Note that C<$unprocessed> will be modified as a side-effect.
812
813=item C<$processed = NFKC_partial($unprocessed)>
814
815It does like C<normalize_partial('NFKC', $unprocessed)>.
816Note that C<$unprocessed> will be modified as a side-effect.
817
8f118dcd
JH
818=back
819
820=head2 Quick Check
821
82e740b6 822(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd
JH
823
824The following functions check whether the string is in that normalization form.
825
fe067ad9 826The result returned will be one of the following:
8f118dcd
JH
827
828 YES The string is in that normalization form.
829 NO The string is not in that normalization form.
830 MAYBE Dubious. Maybe yes, maybe no.
831
832=over 4
833
834=item C<$result = checkNFD($string)>
835
fe067ad9 836It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
837
838=item C<$result = checkNFC($string)>
839
fe067ad9 840It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 841C<undef> if C<MAYBE>.
8f118dcd
JH
842
843=item C<$result = checkNFKD($string)>
844
fe067ad9 845It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
846
847=item C<$result = checkNFKC($string)>
848
fe067ad9 849It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 850C<undef> if C<MAYBE>.
8f118dcd 851
82e740b6
NC
852=item C<$result = checkFCD($string)>
853
fe067ad9 854It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
82e740b6
NC
855
856=item C<$result = checkFCC($string)>
857
fe067ad9 858It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 859C<undef> if C<MAYBE>.
82e740b6 860
fe067ad9 861Note: If a string is not in FCD, it must not be in FCC.
82e740b6
NC
862So C<checkFCC($not_FCD_string)> should return C<NO>.
863
8f118dcd
JH
864=item C<$result = check($form_name, $string)>
865
fe067ad9 866It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 867C<undef> if C<MAYBE>.
8f118dcd 868
628bbff0
RGS
869As C<$form_name>, one of the following names must be given.
870
871 'C' or 'NFC' for Normalization Form C (UAX #15)
872 'D' or 'NFD' for Normalization Form D (UAX #15)
873 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
874 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
875
876 'FCD' for "Fast C or D" Form (UTN #5)
877 'FCC' for "Fast C Contiguous" (UTN #5)
8f118dcd
JH
878
879=back
880
881B<Note>
882
82e740b6
NC
883In the cases of NFD, NFKD, and FCD, the answer must be
884either C<YES> or C<NO>. The answer C<MAYBE> may be returned
885in the cases of NFC, NFKC, and FCC.
8f118dcd 886
82e740b6
NC
887A C<MAYBE> string should contain at least one combining character
888or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 889the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 890
8f118dcd
JH
891Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
892and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 893C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd
JH
894(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
895while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
896
628bbff0
RGS
897If you want to check exactly, compare the string with its NFC/NFKC/FCC.
898
899 if ($string eq NFC($string)) {
a96160d6 900 # $string is exactly normalized in NFC;
628bbff0 901 } else {
a96160d6 902 # $string is not normalized in NFC;
628bbff0 903 }
8f118dcd 904
628bbff0 905 if ($string eq NFKC($string)) {
a96160d6 906 # $string is exactly normalized in NFKC;
628bbff0 907 } else {
a96160d6 908 # $string is not normalized in NFKC;
628bbff0 909 }
8f118dcd 910
2a204b45
JH
911=head2 Character Data
912
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
2a204b45
JH
916
917=over 4
918
fe067ad9 919=item C<$canonical_decomposition = getCanon($code_point)>
2a204b45 920
fe067ad9
SP
921If the character is canonically decomposable (including Hangul Syllables),
922it returns the (full) canonical decomposition as a string.
923Otherwise it returns C<undef>.
8f118dcd 924
fe067ad9
SP
B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.
8f118dcd 928
fe067ad9 929=item C<$compatibility_decomposition = getCompat($code_point)>
2a204b45 930
fe067ad9
SP
931If the character is compatibility decomposable (including Hangul Syllables),
932it returns the (full) compatibility decomposition as a string.
933Otherwise it returns C<undef>.
2a204b45 934
fe067ad9
SP
B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.
2a204b45 938
fe067ad9 939=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
2a204b45 940
fe067ad9 941If two characters here and next (as code points) are composable
8f118dcd 942(including Hangul Jamo/Syllables and Composition Exclusions),
fe067ad9
SP
943it returns the code point of the composite.
944
945If they are not composable, it returns C<undef>.
2a204b45 946
fe067ad9 947=item C<$combining_class = getCombinClass($code_point)>
2a204b45 948
fe067ad9 949It returns the combining class (as an integer) of the character.
2a204b45 950
fe067ad9 951=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
2a204b45 952
fe067ad9
SP
953It returns a boolean whether the character of the specified codepoint
954may be composed with the previous one in a certain composition
955(including Hangul Compositions, but excluding
956Composition Exclusions and Non-Starter Decompositions).
2a204b45 957
fe067ad9 958=item C<$is_exclusion = isExclusion($code_point)>
8f118dcd 959
fe067ad9 960It returns a boolean whether the code point is a composition exclusion.
8f118dcd 961
fe067ad9 962=item C<$is_singleton = isSingleton($code_point)>
8f118dcd 963
It returns a boolean whether the code point is a singleton.
8f118dcd 965
fe067ad9 966=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
8f118dcd 967
fe067ad9 968It returns a boolean whether the code point has Non-Starter Decomposition.
8f118dcd 969
fe067ad9
SP
970=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
971
972It returns a boolean of the derived property Comp_Ex
973(Full_Composition_Exclusion). This property is generated from
974Composition Exclusions + Singletons + Non-Starter Decompositions.
975
976=item C<$NFD_is_NO = isNFD_NO($code_point)>
977
978It returns a boolean of the derived property NFD_NO
979(NFD_Quick_Check=No).
980
981=item C<$NFC_is_NO = isNFC_NO($code_point)>
982
983It returns a boolean of the derived property NFC_NO
984(NFC_Quick_Check=No).
985
986=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
987
988It returns a boolean of the derived property NFC_MAYBE
989(NFC_Quick_Check=Maybe).
990
991=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
992
993It returns a boolean of the derived property NFKD_NO
994(NFKD_Quick_Check=No).
995
996=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
997
998It returns a boolean of the derived property NFKC_NO
999(NFKC_Quick_Check=No).
1000
1001=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
1002
1003It returns a boolean of the derived property NFKC_MAYBE
1004(NFKC_Quick_Check=Maybe).
2a204b45
JH
1005
1006=back
1007
628bbff0 1008=head1 EXPORT
2a204b45
JH
1009
1010C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
1011
C<normalize> and some other functions: on request.
1013
628bbff0
RGS
1014=head1 CAVEATS
1015
1016=over 4
1017
1018=item Perl's version vs. Unicode version
1019
1020Since this module refers to perl core's Unicode database in the directory
1021F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
1022normalization implemented by this module depends on your perl's version.
1023
fe067ad9
SP
1024 perl's version implemented Unicode version
1025 5.6.1 3.0.1
1026 5.7.2 3.1.0
1027 5.7.3 3.1.1 (normalization is same as 3.1.0)
1028 5.8.0 3.2.0
1029 5.8.1-5.8.3 4.0.0
1030 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
1031 5.8.7-5.8.8 4.1.0
51683ce6 1032 5.10.0 5.0.0
dd8fe490 1033 5.8.9, 5.10.1 5.1.0
65e3945f 1034 5.12.x 5.2.0
a257baa9
CBW
1035 5.14.x 6.0.0
1036 5.16.x 6.1.0
65e3945f 1037 5.18.x 6.2.0
628bbff0
RGS
1038
1039=item Correction of decomposition mapping
1040
1041In older Unicode versions, a small number of characters (all of which are
1042CJK compatibility ideographs as far as they have been found) may have
1043an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
1044Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
1045nor provide any specific version of normalization. Therefore this module
1046running on an older perl with an older Unicode database may use
1047the erroneous decomposition mapping blindly conforming to the Unicode database.
1048
1049=item Revised definition of canonical composition
1050
1051In Unicode 4.1.0, the definition D2 of canonical composition (which
1052affects NFC and NFKC) has been changed (see Public Review Issue #29
1053and recent UAX #15). This module has used the newer definition
1054since the version 0.07 (Oct 31, 2001).
2b8d773d 1055This module will not support the normalization according to the older
628bbff0
RGS
1056definition, even if the Unicode version implemented by perl is
1057lower than 4.1.0.
1058
1059=back
1060
2a204b45
JH
1061=head1 AUTHOR
1062
a092bcfd 1063SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
2a204b45 1064
b7657688 1065Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 1066
628bbff0
RGS
1067This module is free software; you can redistribute it
1068and/or modify it under the same terms as Perl itself.
2a204b45
JH
1069
1070=head1 SEE ALSO
1071
1072=over 4
1073
e524f5b2 1074=item http://www.unicode.org/reports/tr15/
2a204b45
JH
1075
1076Unicode Normalization Forms - UAX #15
1077
fe067ad9
SP
1078=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
1079
1080Composition Exclusion Table
1081
14e6b36c 1082=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd
JH
1083
1084Derived Normalization Properties
1085
628bbff0
RGS
1086=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
1087
1088Normalization Corrections
1089
1090=item http://www.unicode.org/review/pr-29.html
1091
1092Public Review Issue #29: Normalization Issue
1093
82e740b6
NC
1094=item http://www.unicode.org/notes/tn5/
1095
1096Canonical Equivalence in Applications - UTN #5
1097
2a204b45
JH
1098=back
1099
1100=cut