This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Unicode::Normalize from version 1.19 to 1.21
[perl5.git] / cpan / Unicode-Normalize / Normalize.pm
CommitLineData
ac5ea531
JH
1package Unicode::Normalize;
2
4a2e806c 3BEGIN {
65e3945f 4 unless ('A' eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Normalize cannot stringify a Unicode code point\n";
4a2e806c 6 }
65e3945f
CBW
7 unless (0x41 == unpack('U', 'A')) {
8 die "Unicode::Normalize cannot get Unicode code point\n";
9 }
4a2e806c
JH
10}
11
ac5ea531
JH
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16
e524f5b2
NC
17no warnings 'utf8';
18
3ece07bc 19our $VERSION = '1.21';
ac5ea531
JH
20our $PACKAGE = __PACKAGE__;
21
ac5ea531 22our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45
JH
23our @EXPORT_OK = qw(
24 normalize decompose reorder compose
8f118dcd
JH
25 checkNFD checkNFKD checkNFC checkNFKC check
26 getCanon getCompat getComposite getCombinClass
27 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
28 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
a96160d6
CBW
29 FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
30 normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
8f118dcd
JH
31);
32our %EXPORT_TAGS = (
33 all => [ @EXPORT, @EXPORT_OK ],
34 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
35 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
82e740b6 36 fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
2a204b45 37);
ac5ea531 38
fe067ad9 39##
788ba0f4 40## utilities for tests
fe067ad9
SP
41##
42
9f1f04a1 43sub pack_U {
b8d10bc1 44 return pack('U*', @_);
9f1f04a1
RGS
45}
46
47sub unpack_U {
3ece07bc
SH
48
49 # The empty pack returns an empty UTF-8 string, so the effect is to force
50 # the shifted parameter into being UTF-8. This shouldn't matter; the
51 # commit messages seem to point to an attempt to get things to work in
52 # EBCDIC in 5.8.
fe067ad9 53 return unpack('U*', shift(@_).pack('U*'));
9f1f04a1
RGS
54}
55
3ece07bc
SH
56BEGIN {
57 # Starting in v5.20, the tables in lib/unicore are built using the
58 # platform's native character set for code points 0-255. Things like the
59 # combining class and compositions exclusions are all above 255, so it
60 # doesn't matter for them.
61
62 *pack_unicore = ($] ge 5.020)
63 ? sub { return pack('W*', @_); }
64 : \&pack_U;
65
66 *unpack_unicore = ($] ge 5.020)
67 ? sub { return unpack('W*', $_[0]); }
68 : \&unpack_U;
69}
70
a96160d6
CBW
71require Exporter;
72
95f3e8d2
SH
73our @ISA = qw(Exporter);
74use File::Spec;
75
76our %Combin; # $codepoint => $number : combination class
77our %Canon; # $codepoint => \@codepoints : canonical decomp.
78our %Compat; # $codepoint => \@codepoints : compat. decomp.
79our %Compos; # $1st,$2nd => $codepoint : composite
80our %Exclus; # $codepoint => 1 : composition exclusions
81our %Single; # $codepoint => 1 : singletons
82our %NonStD; # $codepoint => 1 : non-starter decompositions
83our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
84
85# from core Unicode database
86our $Combin = do "unicore/CombiningClass.pl"
87 || do "unicode/CombiningClass.pl"
88 || croak "$PACKAGE: CombiningClass.pl not found";
89our $Decomp = do "unicore/Decomposition.pl"
90 || do "unicode/Decomposition.pl"
91 || croak "$PACKAGE: Decomposition.pl not found";
92
3ece07bc
SH
93# CompositionExclusions.txt since Unicode 3.2.0. Modern perl versions allow
94# one to get this table from Unicode::UCD, so if it ever changes, it might be
95# better to retrieve it from there, rather than hard-coding it here.
95f3e8d2
SH
96our @CompEx = qw(
97 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
98 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
99 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
100 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
101 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
102 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
103 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
104);
105
106# definition of Hangul constants
107use constant SBase => 0xAC00;
108use constant SFinal => 0xD7A3; # SBase -1 + SCount
109use constant SCount => 11172; # LCount * NCount
110use constant NCount => 588; # VCount * TCount
111use constant LBase => 0x1100;
112use constant LFinal => 0x1112;
113use constant LCount => 19;
114use constant VBase => 0x1161;
115use constant VFinal => 0x1175;
116use constant VCount => 21;
117use constant TBase => 0x11A7;
118use constant TFinal => 0x11C2;
119use constant TCount => 28;
120
121sub decomposeHangul {
122 my $sindex = $_[0] - SBase;
123 my $lindex = int( $sindex / NCount);
124 my $vindex = int(($sindex % NCount) / TCount);
125 my $tindex = $sindex % TCount;
126 my @ret = (
127 LBase + $lindex,
128 VBase + $vindex,
129 $tindex ? (TBase + $tindex) : (),
130 );
3ece07bc 131 return wantarray ? @ret : pack_unicore(@ret);
95f3e8d2
SH
132}
133
134########## getting full decomposition ##########
135
136## converts string "hhhh hhhh hhhh" to a numeric list
137## (hex digits separated by spaces)
138sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
139
140while ($Combin =~ /(.+)/g) {
141 my @tab = split /\t/, $1;
142 my $ini = hex $tab[0];
143 if ($tab[1] eq '') {
144 $Combin{$ini} = $tab[2];
145 } else {
146 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
147 }
148}
149
150while ($Decomp =~ /(.+)/g) {
151 my @tab = split /\t/, $1;
152 my $compat = $tab[2] =~ s/<[^>]+>//;
153 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
154 my $ini = hex($tab[0]); # initial decomposable character
155 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
156 # ($ini .. $end) is the range of decomposable characters.
157
158 foreach my $u ($ini .. $end) {
159 $Compat{$u} = $dec;
160 $Canon{$u} = $dec if ! $compat;
161 }
162}
163
164for my $s (@CompEx) {
165 my $u = hex $s;
166 next if !$Canon{$u}; # not assigned
167 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
168 $Exclus{$u} = 1;
169}
170
171foreach my $u (keys %Canon) {
172 my $dec = $Canon{$u};
173
174 if (@$dec == 2) {
175 if ($Combin{ $dec->[0] }) {
176 $NonStD{$u} = 1;
177 } else {
178 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
179 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
180 }
181 } elsif (@$dec == 1) {
182 $Single{$u} = 1;
183 } else {
184 my $h = sprintf '%04X', $u;
185 croak("Weird Canonical Decomposition of U+$h");
186 }
187}
188
189# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
190foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
191 $Comp2nd{$j} = 1;
192}
193
194sub getCanonList {
195 my @src = @_;
196 my @dec = map {
197 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
198 : $Canon{$_} ? @{ $Canon{$_} } : $_
199 } @src;
200 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
201 # condition @src == @dec is not ok.
202}
203
204sub getCompatList {
205 my @src = @_;
206 my @dec = map {
207 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
208 : $Compat{$_} ? @{ $Compat{$_} } : $_
209 } @src;
210 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
211 # condition @src == @dec is not ok.
212}
213
214# exhaustive decomposition
215foreach my $key (keys %Canon) {
216 $Canon{$key} = [ getCanonList($key) ];
217}
218
219# exhaustive decomposition
220foreach my $key (keys %Compat) {
221 $Compat{$key} = [ getCompatList($key) ];
222}
223
224sub getHangulComposite ($$) {
225 if ((LBase <= $_[0] && $_[0] <= LFinal)
226 && (VBase <= $_[1] && $_[1] <= VFinal)) {
227 my $lindex = $_[0] - LBase;
228 my $vindex = $_[1] - VBase;
229 return (SBase + ($lindex * VCount + $vindex) * TCount);
230 }
231 if ((SBase <= $_[0] && $_[0] <= SFinal && (($_[0] - SBase ) % TCount) == 0)
232 && (TBase < $_[1] && $_[1] <= TFinal)) {
233 return($_[0] + $_[1] - TBase);
234 }
235 return undef;
236}
237
238##########
239
240sub getCombinClass ($) {
241 my $uv = 0 + shift;
242 return $Combin{$uv} || 0;
243}
244
245sub getCanon ($) {
246 my $uv = 0 + shift;
247 return exists $Canon{$uv}
3ece07bc 248 ? pack_unicore(@{ $Canon{$uv} })
95f3e8d2
SH
249 : (SBase <= $uv && $uv <= SFinal)
250 ? scalar decomposeHangul($uv)
251 : undef;
252}
253
254sub getCompat ($) {
255 my $uv = 0 + shift;
256 return exists $Compat{$uv}
3ece07bc 257 ? pack_unicore(@{ $Compat{$uv} })
95f3e8d2
SH
258 : (SBase <= $uv && $uv <= SFinal)
259 ? scalar decomposeHangul($uv)
260 : undef;
261}
262
263sub getComposite ($$) {
264 my $uv1 = 0 + shift;
265 my $uv2 = 0 + shift;
266 my $hangul = getHangulComposite($uv1, $uv2);
267 return $hangul if $hangul;
268 return $Compos{ $uv1 } && $Compos{ $uv1 }{ $uv2 };
269}
270
271sub isExclusion ($) {
272 my $uv = 0 + shift;
273 return exists $Exclus{$uv};
274}
275
276sub isSingleton ($) {
277 my $uv = 0 + shift;
278 return exists $Single{$uv};
279}
280
281sub isNonStDecomp($) {
282 my $uv = 0 + shift;
283 return exists $NonStD{$uv};
284}
285
286sub isComp2nd ($) {
287 my $uv = 0 + shift;
288 return exists $Comp2nd{$uv};
289}
290
291sub isNFC_MAYBE ($) {
292 my $uv = 0 + shift;
293 return exists $Comp2nd{$uv};
294}
295
296sub isNFKC_MAYBE($) {
297 my $uv = 0 + shift;
298 return exists $Comp2nd{$uv};
299}
300
301sub isNFD_NO ($) {
302 my $uv = 0 + shift;
303 return exists $Canon {$uv} || (SBase <= $uv && $uv <= SFinal);
304}
305
306sub isNFKD_NO ($) {
307 my $uv = 0 + shift;
308 return exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
309}
310
311sub isComp_Ex ($) {
312 my $uv = 0 + shift;
313 return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
314}
315
316sub isNFC_NO ($) {
317 my $uv = 0 + shift;
318 return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
319}
320
321sub isNFKC_NO ($) {
322 my $uv = 0 + shift;
323 return 1 if $Exclus{$uv} || $Single{$uv} || $NonStD{$uv};
324 return '' if (SBase <= $uv && $uv <= SFinal) || !exists $Compat{$uv};
325 return 1 if ! exists $Canon{$uv};
326 return pack('N*', @{ $Canon{$uv} }) ne pack('N*', @{ $Compat{$uv} });
327}
328
329##
330## string decompose(string, compat?)
331##
332sub decompose ($;$)
333{
334 my $hash = $_[1] ? \%Compat : \%Canon;
3ece07bc 335 return pack_unicore map {
95f3e8d2
SH
336 $hash->{ $_ } ? @{ $hash->{ $_ } } :
337 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) : $_
3ece07bc 338 } unpack_unicore($_[0]);
95f3e8d2
SH
339}
340
341##
342## string reorder(string)
343##
344sub reorder ($)
345{
3ece07bc 346 my @src = unpack_unicore($_[0]);
95f3e8d2
SH
347
348 for (my $i=0; $i < @src;) {
349 $i++, next if ! $Combin{ $src[$i] };
a96160d6 350
95f3e8d2
SH
351 my $ini = $i;
352 $i++ while $i < @src && $Combin{ $src[$i] };
a96160d6 353
95f3e8d2
SH
354 my @tmp = sort {
355 $Combin{ $src[$a] } <=> $Combin{ $src[$b] } || $a <=> $b
356 } $ini .. $i - 1;
357
358 @src[ $ini .. $i - 1 ] = @src[ @tmp ];
359 }
3ece07bc 360 return pack_unicore(@src);
95f3e8d2
SH
361}
362
363
364##
365## string compose(string)
366##
367## S : starter; NS : not starter;
368##
369## composable sequence begins at S.
370## S + S or (S + S) + S may be composed.
371## NS + NS must not be composed.
372##
373sub compose ($)
374{
3ece07bc 375 my @src = unpack_unicore($_[0]);
95f3e8d2
SH
376
377 for (my $s = 0; $s+1 < @src; $s++) {
378 next unless defined $src[$s] && ! $Combin{ $src[$s] };
379 # S only; removed or combining are skipped as a starter.
380
381 my($c, $blocked, $uncomposed_cc);
382 for (my $j = $s+1; $j < @src && !$blocked; $j++) {
383 ($Combin{ $src[$j] } ? $uncomposed_cc : $blocked) = 1;
384
385 # S + C + S => S-S + C would be blocked.
386 next if $blocked && $uncomposed_cc;
387
388 # blocked by same CC (and higher CC: revised D2)
389 next if defined $src[$j-1] && $Combin{ $src[$j-1] }
390 && $Combin{ $src[$j-1] } >= $Combin{ $src[$j] };
391
392 $c = getComposite($src[$s], $src[$j]);
393
394 # no composite or is exclusion
395 next if !$c || $Exclus{$c};
396
397 # replace by composite
398 $src[$s] = $c; $src[$j] = undef;
399 if ($blocked) { $blocked = 0 } else { -- $uncomposed_cc }
400 }
401 }
3ece07bc 402 return pack_unicore(grep defined, @src);
95f3e8d2
SH
403}
404
405
406##
407## string composeContiguous(string)
408##
409sub composeContiguous ($)
410{
3ece07bc 411 my @src = unpack_unicore($_[0]);
95f3e8d2
SH
412
413 for (my $s = 0; $s+1 < @src; $s++) {
414 next unless defined $src[$s] && ! $Combin{ $src[$s] };
415 # S only; removed or combining are skipped as a starter.
416
417 for (my $j = $s+1; $j < @src; $j++) {
418 my $c = getComposite($src[$s], $src[$j]);
419
420 # no composite or is exclusion
421 last if !$c || $Exclus{$c};
422
423 # replace by composite
424 $src[$s] = $c; $src[$j] = undef;
425 }
426 }
3ece07bc 427 return pack_unicore(grep defined, @src);
95f3e8d2
SH
428}
429
430
431##
432## normalization forms
433##
434
435use constant COMPAT => 1;
436
437sub NFD ($) { reorder(decompose($_[0])) }
438sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
439sub NFC ($) { compose(reorder(decompose($_[0]))) }
440sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
441sub FCC ($) { composeContiguous(reorder(decompose($_[0]))) }
442
443##
444## quick check
445##
446
447sub checkNFD ($)
448{
449 my $preCC = 0;
450 my $curCC;
3ece07bc 451 for my $uv (unpack_unicore($_[0])) {
95f3e8d2
SH
452 $curCC = $Combin{ $uv } || 0;
453 return '' if $preCC > $curCC && $curCC != 0;
454 return '' if exists $Canon{$uv} || (SBase <= $uv && $uv <= SFinal);
455 $preCC = $curCC;
456 }
457 return 1;
458}
459
460sub checkNFKD ($)
461{
462 my $preCC = 0;
463 my $curCC;
3ece07bc 464 for my $uv (unpack_unicore($_[0])) {
95f3e8d2
SH
465 $curCC = $Combin{ $uv } || 0;
466 return '' if $preCC > $curCC && $curCC != 0;
467 return '' if exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
468 $preCC = $curCC;
469 }
470 return 1;
471}
472
473sub checkNFC ($)
474{
475 my $preCC = 0;
476 my($curCC, $isMAYBE);
3ece07bc 477 for my $uv (unpack_unicore($_[0])) {
95f3e8d2
SH
478 $curCC = $Combin{ $uv } || 0;
479 return '' if $preCC > $curCC && $curCC != 0;
480
481 if (isNFC_MAYBE($uv)) {
482 $isMAYBE = 1;
483 } elsif (isNFC_NO($uv)) {
484 return '';
485 }
486 $preCC = $curCC;
487 }
488 return $isMAYBE ? undef : 1;
489}
490
491sub checkNFKC ($)
492{
493 my $preCC = 0;
494 my($curCC, $isMAYBE);
3ece07bc 495 for my $uv (unpack_unicore($_[0])) {
95f3e8d2
SH
496 $curCC = $Combin{ $uv } || 0;
497 return '' if $preCC > $curCC && $curCC != 0;
498
499 if (isNFKC_MAYBE($uv)) {
500 $isMAYBE = 1;
501 } elsif (isNFKC_NO($uv)) {
502 return '';
503 }
504 $preCC = $curCC;
505 }
506 return $isMAYBE ? undef : 1;
507}
508
509sub checkFCD ($)
510{
511 my $preCC = 0;
512 my $curCC;
3ece07bc 513 for my $uv (unpack_unicore($_[0])) {
95f3e8d2
SH
514 # Hangul syllable need not decomposed since cc[any Jamo] == 0;
515 my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
516
517 $curCC = $Combin{ $uvCan[0] } || 0;
518 return '' if $curCC != 0 && $curCC < $preCC;
519 $preCC = $Combin{ $uvCan[-1] } || 0;
520 }
521 return 1;
522}
523
524sub checkFCC ($)
525{
526 my $preCC = 0;
527 my($curCC, $isMAYBE);
3ece07bc 528 for my $uv (unpack_unicore($_[0])) {
95f3e8d2
SH
529 # Hangul syllable need not decomposed since cc[any Jamo] == 0;
530 my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
531
532 $curCC = $Combin{ $uvCan[0] } || 0;
533 return '' if $curCC != 0 && $curCC < $preCC;
534
535 if (isNFC_MAYBE($uv)) {
536 $isMAYBE = 1;
537 } elsif (isNFC_NO($uv)) {
538 return '';
539 }
540
541 $preCC = $Combin{ $uvCan[-1] } || 0;
542 }
543 return $isMAYBE ? undef : 1;
544}
545
546##
547## split on last starter
548##
549
550sub splitOnLastStarter
551{
3ece07bc 552 my $str = pack_unicore(unpack_unicore(shift));
95f3e8d2
SH
553 if ($str eq '') {
554 return ('', '');
555 }
556
557 my $ch;
558 my $unproc = "";
559 do {
560 $ch = chop($str);
561 $unproc = $ch.$unproc;
3ece07bc
SH
562 } # Relies on the fact that the combining class for code points < 256 is
563 # 0, so don't have to worry about EBCDIC issues
564 while (getCombinClass(unpack 'U', $ch) && $str ne "");
95f3e8d2
SH
565 return ($str, $unproc);
566}
82e740b6
NC
567
568##
a96160d6 569## normalize
82e740b6
NC
570##
571
82e740b6
NC
572sub FCD ($) {
573 my $str = shift;
574 return checkFCD($str) ? $str : NFD($str);
575}
82e740b6
NC
576
577our %formNorm = (
578 NFC => \&NFC, C => \&NFC,
579 NFD => \&NFD, D => \&NFD,
580 NFKC => \&NFKC, KC => \&NFKC,
581 NFKD => \&NFKD, KD => \&NFKD,
582 FCD => \&FCD, FCC => \&FCC,
583);
584
ac5ea531
JH
585sub normalize($$)
586{
d85850a7 587 my $form = shift;
f027f502 588 my $str = shift;
fe067ad9
SP
589 if (exists $formNorm{$form}) {
590 return $formNorm{$form}->($str);
591 }
592 croak($PACKAGE."::normalize: invalid form name: $form");
ac5ea531
JH
593}
594
a96160d6
CBW
595##
596## partial
597##
598
599sub normalize_partial ($$) {
600 if (exists $formNorm{$_[0]}) {
601 my $n = normalize($_[0], $_[1]);
602 my($p, $u) = splitOnLastStarter($n);
603 $_[1] = $u;
604 return $p;
605 }
606 croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
607}
608
609sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
610sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
611sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
612sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
82e740b6
NC
613
614##
a96160d6 615## check
82e740b6
NC
616##
617
618our %formCheck = (
619 NFC => \&checkNFC, C => \&checkNFC,
620 NFD => \&checkNFD, D => \&checkNFD,
621 NFKC => \&checkNFKC, KC => \&checkNFKC,
622 NFKD => \&checkNFKD, KD => \&checkNFKD,
623 FCD => \&checkFCD, FCC => \&checkFCC,
624);
625
8f118dcd
JH
626sub check($$)
627{
628 my $form = shift;
f027f502 629 my $str = shift;
fe067ad9
SP
630 if (exists $formCheck{$form}) {
631 return $formCheck{$form}->($str);
632 }
633 croak($PACKAGE."::check: invalid form name: $form");
8f118dcd
JH
634}
635
ac5ea531
JH
6361;
637__END__
2a204b45
JH
638
639=head1 NAME
640
f027f502 641Unicode::Normalize - Unicode Normalization Forms
2a204b45
JH
642
643=head1 SYNOPSIS
644
a092bcfd
RGS
645(1) using function names exported by default:
646
2a204b45
JH
647 use Unicode::Normalize;
648
8f118dcd
JH
649 $NFD_string = NFD($string); # Normalization Form D
650 $NFC_string = NFC($string); # Normalization Form C
651 $NFKD_string = NFKD($string); # Normalization Form KD
652 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 653
a092bcfd 654(2) using function names exported on request:
2a204b45
JH
655
656 use Unicode::Normalize 'normalize';
657
8f118dcd
JH
658 $NFD_string = normalize('D', $string); # Normalization Form D
659 $NFC_string = normalize('C', $string); # Normalization Form C
660 $NFKD_string = normalize('KD', $string); # Normalization Form KD
661 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45
JH
662
663=head1 DESCRIPTION
664
00f2676f
JH
665Parameters:
666
fe067ad9 667C<$string> is used as a string under character semantics (see F<perlunicode>).
00f2676f 668
fe067ad9 669C<$code_point> should be an unsigned integer representing a Unicode code point.
00f2676f 670
95f3e8d2 671Note: Do not use a floating point nor a negative sign in C<$code_point>.
00f2676f 672
d85850a7 673=head2 Normalization Forms
2a204b45
JH
674
675=over 4
676
8f118dcd 677=item C<$NFD_string = NFD($string)>
2a204b45 678
fe067ad9 679It returns the Normalization Form D (formed by canonical decomposition).
2a204b45 680
8f118dcd 681=item C<$NFC_string = NFC($string)>
2a204b45 682
fe067ad9 683It returns the Normalization Form C (formed by canonical decomposition
2a204b45
JH
684followed by canonical composition).
685
8f118dcd 686=item C<$NFKD_string = NFKD($string)>
2a204b45 687
fe067ad9 688It returns the Normalization Form KD (formed by compatibility decomposition).
2a204b45 689
8f118dcd 690=item C<$NFKC_string = NFKC($string)>
2a204b45 691
fe067ad9 692It returns the Normalization Form KC (formed by compatibility decomposition
2a204b45
JH
693followed by B<canonical> composition).
694
82e740b6
NC
695=item C<$FCD_string = FCD($string)>
696
697If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
fe067ad9 698it returns the string without modification; otherwise it returns an FCD string.
82e740b6
NC
699
700Note: FCD is not always unique, then plural forms may be equivalent
701each other. C<FCD()> will return one of these equivalent forms.
702
703=item C<$FCC_string = FCC($string)>
704
fe067ad9 705It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
82e740b6 706
e524f5b2 707Note: FCC is unique, as well as four normalization forms (NF*).
82e740b6 708
8f118dcd 709=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 710
fe067ad9
SP
711It returns the normalization form of C<$form_name>.
712
2a204b45
JH
713As C<$form_name>, one of the following names must be given.
714
82e740b6
NC
715 'C' or 'NFC' for Normalization Form C (UAX #15)
716 'D' or 'NFD' for Normalization Form D (UAX #15)
717 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
718 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
719
720 'FCD' for "Fast C or D" Form (UTN #5)
721 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45
JH
722
723=back
724
8f118dcd
JH
725=head2 Decomposition and Composition
726
727=over 4
728
fe067ad9 729=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
8f118dcd 730
fe067ad9
SP
731It returns the concatenation of the decomposition of each character
732in the string.
8f118dcd 733
fe067ad9
SP
734If the second parameter (a boolean) is omitted or false,
735the decomposition is canonical decomposition;
736if the second parameter (a boolean) is true,
737the decomposition is compatibility decomposition.
8f118dcd 738
fe067ad9 739The string returned is not always in NFD/NFKD. Reordering may be required.
8f118dcd
JH
740
741 $NFD_string = reorder(decompose($string)); # eq. to NFD()
742 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
743
fe067ad9 744=item C<$reordered_string = reorder($string)>
8f118dcd 745
fe067ad9
SP
746It returns the result of reordering the combining characters
747according to Canonical Ordering Behavior.
8f118dcd 748
fe067ad9
SP
749For example, when you have a list of NFD/NFKD strings,
750you can get the concatenated NFD/NFKD string from them, by saying
8f118dcd
JH
751
752 $concat_NFD = reorder(join '', @NFD_strings);
753 $concat_NFKD = reorder(join '', @NFKD_strings);
754
fe067ad9 755=item C<$composed_string = compose($string)>
8f118dcd 756
fe067ad9
SP
757It returns the result of canonical composition
758without applying any decomposition.
8f118dcd 759
fe067ad9
SP
760For example, when you have a NFD/NFKD string,
761you can get its NFC/NFKC string, by saying
8f118dcd
JH
762
763 $NFC_string = compose($NFD_string);
764 $NFKC_string = compose($NFKD_string);
765
a96160d6
CBW
766=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
767
768It returns two strings: the first one, C<$processed>, is a part
769before the last starter, and the second one, C<$unprocessed> is
770another part after the first part. A starter is a character having
771a combining class of zero (see UAX #15).
772
773Note that C<$processed> may be empty (when C<$normalized> contains no
774starter or starts with the last starter), and then C<$unprocessed>
775should be equal to the entire C<$normalized>.
776
777When you have a C<$normalized> string and an C<$unnormalized> string
778following it, a simple concatenation is wrong:
779
780 $concat = $normalized . normalize($form, $unnormalized); # wrong!
781
782Instead of it, do like this:
783
784 ($processed, $unprocessed) = splitOnLastStarter($normalized);
785 $concat = $processed . normalize($form, $unprocessed.$unnormalized);
786
787C<splitOnLastStarter()> should be called with a pre-normalized parameter
788C<$normalized>, that is in the same form as C<$form> you want.
789
790If you have an array of C<@string> that should be concatenated and then
791normalized, you can do like this:
792
793 my $result = "";
794 my $unproc = "";
795 foreach my $str (@string) {
796 $unproc .= $str;
797 my $n = normalize($form, $unproc);
798 my($p, $u) = splitOnLastStarter($n);
799 $result .= $p;
800 $unproc = $u;
801 }
802 $result .= $unproc;
803 # instead of normalize($form, join('', @string))
804
805=item C<$processed = normalize_partial($form, $unprocessed)>
806
807A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
808Note that C<$unprocessed> will be modified as a side-effect.
809
810If you have an array of C<@string> that should be concatenated and then
811normalized, you can do like this:
812
813 my $result = "";
814 my $unproc = "";
815 foreach my $str (@string) {
816 $unproc .= $str;
817 $result .= normalize_partial($form, $unproc);
818 }
819 $result .= $unproc;
820 # instead of normalize($form, join('', @string))
821
822=item C<$processed = NFD_partial($unprocessed)>
823
824It does like C<normalize_partial('NFD', $unprocessed)>.
825Note that C<$unprocessed> will be modified as a side-effect.
826
827=item C<$processed = NFC_partial($unprocessed)>
828
829It does like C<normalize_partial('NFC', $unprocessed)>.
830Note that C<$unprocessed> will be modified as a side-effect.
831
832=item C<$processed = NFKD_partial($unprocessed)>
833
834It does like C<normalize_partial('NFKD', $unprocessed)>.
835Note that C<$unprocessed> will be modified as a side-effect.
836
837=item C<$processed = NFKC_partial($unprocessed)>
838
839It does like C<normalize_partial('NFKC', $unprocessed)>.
840Note that C<$unprocessed> will be modified as a side-effect.
841
8f118dcd
JH
842=back
843
844=head2 Quick Check
845
82e740b6 846(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd
JH
847
848The following functions check whether the string is in that normalization form.
849
fe067ad9 850The result returned will be one of the following:
8f118dcd
JH
851
852 YES The string is in that normalization form.
853 NO The string is not in that normalization form.
854 MAYBE Dubious. Maybe yes, maybe no.
855
856=over 4
857
858=item C<$result = checkNFD($string)>
859
fe067ad9 860It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
861
862=item C<$result = checkNFC($string)>
863
fe067ad9 864It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 865C<undef> if C<MAYBE>.
8f118dcd
JH
866
867=item C<$result = checkNFKD($string)>
868
fe067ad9 869It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
870
871=item C<$result = checkNFKC($string)>
872
fe067ad9 873It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 874C<undef> if C<MAYBE>.
8f118dcd 875
82e740b6
NC
876=item C<$result = checkFCD($string)>
877
fe067ad9 878It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
82e740b6
NC
879
880=item C<$result = checkFCC($string)>
881
fe067ad9 882It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 883C<undef> if C<MAYBE>.
82e740b6 884
fe067ad9 885Note: If a string is not in FCD, it must not be in FCC.
82e740b6
NC
886So C<checkFCC($not_FCD_string)> should return C<NO>.
887
8f118dcd
JH
888=item C<$result = check($form_name, $string)>
889
fe067ad9 890It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 891C<undef> if C<MAYBE>.
8f118dcd 892
628bbff0
RGS
893As C<$form_name>, one of the following names must be given.
894
895 'C' or 'NFC' for Normalization Form C (UAX #15)
896 'D' or 'NFD' for Normalization Form D (UAX #15)
897 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
898 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
899
900 'FCD' for "Fast C or D" Form (UTN #5)
901 'FCC' for "Fast C Contiguous" (UTN #5)
8f118dcd
JH
902
903=back
904
905B<Note>
906
82e740b6
NC
907In the cases of NFD, NFKD, and FCD, the answer must be
908either C<YES> or C<NO>. The answer C<MAYBE> may be returned
909in the cases of NFC, NFKC, and FCC.
8f118dcd 910
82e740b6
NC
911A C<MAYBE> string should contain at least one combining character
912or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 913the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 914
8f118dcd
JH
915Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
916and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 917C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd
JH
918(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
919while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
920
628bbff0
RGS
921If you want to check exactly, compare the string with its NFC/NFKC/FCC.
922
923 if ($string eq NFC($string)) {
a96160d6 924 # $string is exactly normalized in NFC;
628bbff0 925 } else {
a96160d6 926 # $string is not normalized in NFC;
628bbff0 927 }
8f118dcd 928
628bbff0 929 if ($string eq NFKC($string)) {
a96160d6 930 # $string is exactly normalized in NFKC;
628bbff0 931 } else {
a96160d6 932 # $string is not normalized in NFKC;
628bbff0 933 }
8f118dcd 934
2a204b45
JH
935=head2 Character Data
936
937These functions are interface of character data used internally.
d0ed0342
BG
938If you want only to get Unicode normalization forms, you don't need
939call them yourself.
2a204b45
JH
940
941=over 4
942
fe067ad9 943=item C<$canonical_decomposition = getCanon($code_point)>
2a204b45 944
fe067ad9
SP
945If the character is canonically decomposable (including Hangul Syllables),
946it returns the (full) canonical decomposition as a string.
947Otherwise it returns C<undef>.
8f118dcd 948
fe067ad9
SP
949B<Note:> According to the Unicode standard, the canonical decomposition
950of the character that is not canonically decomposable is same as
951the character itself.
8f118dcd 952
fe067ad9 953=item C<$compatibility_decomposition = getCompat($code_point)>
2a204b45 954
fe067ad9
SP
955If the character is compatibility decomposable (including Hangul Syllables),
956it returns the (full) compatibility decomposition as a string.
957Otherwise it returns C<undef>.
2a204b45 958
fe067ad9
SP
959B<Note:> According to the Unicode standard, the compatibility decomposition
960of the character that is not compatibility decomposable is same as
961the character itself.
2a204b45 962
fe067ad9 963=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
2a204b45 964
fe067ad9 965If two characters here and next (as code points) are composable
8f118dcd 966(including Hangul Jamo/Syllables and Composition Exclusions),
fe067ad9
SP
967it returns the code point of the composite.
968
969If they are not composable, it returns C<undef>.
2a204b45 970
fe067ad9 971=item C<$combining_class = getCombinClass($code_point)>
2a204b45 972
fe067ad9 973It returns the combining class (as an integer) of the character.
2a204b45 974
fe067ad9 975=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
2a204b45 976
fe067ad9
SP
977It returns a boolean whether the character of the specified codepoint
978may be composed with the previous one in a certain composition
979(including Hangul Compositions, but excluding
980Composition Exclusions and Non-Starter Decompositions).
2a204b45 981
fe067ad9 982=item C<$is_exclusion = isExclusion($code_point)>
8f118dcd 983
fe067ad9 984It returns a boolean whether the code point is a composition exclusion.
8f118dcd 985
fe067ad9 986=item C<$is_singleton = isSingleton($code_point)>
8f118dcd 987
fe067ad9 988It returns a boolean whether the code point is a singleton
8f118dcd 989
fe067ad9 990=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
8f118dcd 991
fe067ad9 992It returns a boolean whether the code point has Non-Starter Decomposition.
8f118dcd 993
fe067ad9
SP
994=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
995
996It returns a boolean of the derived property Comp_Ex
997(Full_Composition_Exclusion). This property is generated from
998Composition Exclusions + Singletons + Non-Starter Decompositions.
999
1000=item C<$NFD_is_NO = isNFD_NO($code_point)>
1001
1002It returns a boolean of the derived property NFD_NO
1003(NFD_Quick_Check=No).
1004
1005=item C<$NFC_is_NO = isNFC_NO($code_point)>
1006
1007It returns a boolean of the derived property NFC_NO
1008(NFC_Quick_Check=No).
1009
1010=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
1011
1012It returns a boolean of the derived property NFC_MAYBE
1013(NFC_Quick_Check=Maybe).
1014
1015=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
1016
1017It returns a boolean of the derived property NFKD_NO
1018(NFKD_Quick_Check=No).
1019
1020=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
1021
1022It returns a boolean of the derived property NFKC_NO
1023(NFKC_Quick_Check=No).
1024
1025=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
1026
1027It returns a boolean of the derived property NFKC_MAYBE
1028(NFKC_Quick_Check=Maybe).
2a204b45
JH
1029
1030=back
1031
628bbff0 1032=head1 EXPORT
2a204b45
JH
1033
1034C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
1035
1036C<normalize> and other some functions: on request.
1037
628bbff0
RGS
1038=head1 CAVEATS
1039
1040=over 4
1041
1042=item Perl's version vs. Unicode version
1043
1044Since this module refers to perl core's Unicode database in the directory
1045F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
3ece07bc
SH
1046normalization implemented by this module depends on what has been
1047compiled into your perl. The following table lists the default Unicode
1048version that comes with various perl versions. (It is possible to change
1049the Unicode version in any perl version to be any earlier Unicode version,
1050so one could cause Unicode 3.2 to be used in any perl version starting with
10515.8.0. See C<$Config{privlib}>/F<unicore/README.perl>.
628bbff0 1052
fe067ad9
SP
1053 perl's version implemented Unicode version
1054 5.6.1 3.0.1
1055 5.7.2 3.1.0
1056 5.7.3 3.1.1 (normalization is same as 3.1.0)
1057 5.8.0 3.2.0
3ece07bc
SH
1058 5.8.1-5.8.3 4.0.0
1059 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
1060 5.8.7-5.8.8 4.1.0
51683ce6 1061 5.10.0 5.0.0
3ece07bc 1062 5.8.9, 5.10.1 5.1.0
65e3945f 1063 5.12.x 5.2.0
a257baa9
CBW
1064 5.14.x 6.0.0
1065 5.16.x 6.1.0
65e3945f 1066 5.18.x 6.2.0
3ece07bc
SH
1067 5.20.x 6.3.0
1068 5.22.x 7.0.0
628bbff0
RGS
1069
1070=item Correction of decomposition mapping
1071
1072In older Unicode versions, a small number of characters (all of which are
1073CJK compatibility ideographs as far as they have been found) may have
1074an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
1075Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
1076nor provide any specific version of normalization. Therefore this module
1077running on an older perl with an older Unicode database may use
1078the erroneous decomposition mapping blindly conforming to the Unicode database.
1079
1080=item Revised definition of canonical composition
1081
1082In Unicode 4.1.0, the definition D2 of canonical composition (which
1083affects NFC and NFKC) has been changed (see Public Review Issue #29
1084and recent UAX #15). This module has used the newer definition
1085since the version 0.07 (Oct 31, 2001).
2b8d773d 1086This module will not support the normalization according to the older
628bbff0
RGS
1087definition, even if the Unicode version implemented by perl is
1088lower than 4.1.0.
1089
1090=back
1091
2a204b45
JH
1092=head1 AUTHOR
1093
a092bcfd 1094SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
2a204b45 1095
b7657688 1096Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 1097
628bbff0
RGS
1098This module is free software; you can redistribute it
1099and/or modify it under the same terms as Perl itself.
2a204b45
JH
1100
1101=head1 SEE ALSO
1102
1103=over 4
1104
e524f5b2 1105=item http://www.unicode.org/reports/tr15/
2a204b45
JH
1106
1107Unicode Normalization Forms - UAX #15
1108
fe067ad9
SP
1109=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
1110
1111Composition Exclusion Table
1112
14e6b36c 1113=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd
JH
1114
1115Derived Normalization Properties
1116
628bbff0
RGS
1117=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
1118
1119Normalization Corrections
1120
1121=item http://www.unicode.org/review/pr-29.html
1122
1123Public Review Issue #29: Normalization Issue
1124
82e740b6
NC
1125=item http://www.unicode.org/notes/tn5/
1126
1127Canonical Equivalence in Applications - UTN #5
1128
2a204b45
JH
1129=back
1130
1131=cut