cpan/Unicode-Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
   4     unless ('A' eq pack('U', 0x41)) {
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7     unless (0x41 == unpack('U', 'A')) {
   8         die "Unicode::Normalize cannot get Unicode code point\n";
   9     }
  10 }
  11
  12 use 5.006;
  13 use strict;
  14 use warnings;
  15 use Carp;
  16
  17 no warnings 'utf8';
  18
  19 our $VERSION = '1.18';
  20 our $PACKAGE = __PACKAGE__;
  21
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     checkNFD checkNFKD checkNFC checkNFKC check
  26     getCanon getCompat getComposite getCombinClass
  27     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  28     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  29     FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
  30     normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
  31 );
  32 our %EXPORT_TAGS = (
  33     all       => [ @EXPORT, @EXPORT_OK ],
  34     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  35     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  36     fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  37 );
  38
  39 ##
  40 ## utilities for tests
  41 ##
  42
  43 sub pack_U {
  44     return pack('U*', @_);
  45 }
  46
  47 sub unpack_U {
  48     return unpack('U*', shift(@_).pack('U*'));
  49 }
  50
  51 require Exporter;
  52
  53 our @ISA = qw(Exporter);
  54 use File::Spec;
  55
  56 our %Combin;    # $codepoint => $number    : combination class
  57 our %Canon;     # $codepoint => \@codepoints : canonical decomp.
  58 our %Compat;    # $codepoint => \@codepoints : compat. decomp.
  59 our %Compos;    # $1st,$2nd  => $codepoint : composite
  60 our %Exclus;    # $codepoint => 1          : composition exclusions
  61 our %Single;    # $codepoint => 1          : singletons
  62 our %NonStD;    # $codepoint => 1          : non-starter decompositions
  63 our %Comp2nd;   # $codepoint => 1          : may be composed with a prev char.
  64
  65 # from core Unicode database
  66 our $Combin = do "unicore/CombiningClass.pl"
  67     || do "unicode/CombiningClass.pl"
  68     || croak "$PACKAGE: CombiningClass.pl not found";
  69 our $Decomp = do "unicore/Decomposition.pl"
  70     || do "unicode/Decomposition.pl"
  71     || croak "$PACKAGE: Decomposition.pl not found";
  72
  73 # CompositionExclusions.txt since Unicode 3.2.0
  74 our @CompEx = qw(
  75     0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
  76     0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
  77     0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
  78     FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
  79     FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
  80     FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
  81     1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
  82 );
  83
  84 # definition of Hangul constants
  85 use constant SBase  => 0xAC00;
  86 use constant SFinal => 0xD7A3; # SBase -1 + SCount
  87 use constant SCount =>  11172; # LCount * NCount
  88 use constant NCount =>    588; # VCount * TCount
  89 use constant LBase  => 0x1100;
  90 use constant LFinal => 0x1112;
  91 use constant LCount =>     19;
  92 use constant VBase  => 0x1161;
  93 use constant VFinal => 0x1175;
  94 use constant VCount =>     21;
  95 use constant TBase  => 0x11A7;
  96 use constant TFinal => 0x11C2;
  97 use constant TCount =>     28;
  98
  99 sub decomposeHangul {
 100     my $sindex = $_[0] - SBase;
 101     my $lindex = int( $sindex / NCount);
 102     my $vindex = int(($sindex % NCount) / TCount);
 103     my $tindex =      $sindex % TCount;
 104     my @ret = (
 105        LBase + $lindex,
 106        VBase + $vindex,
 107       $tindex ? (TBase + $tindex) : (),
 108     );
 109     return wantarray ? @ret : pack_U(@ret);
 110 }
 111
 112 ########## getting full decomposition ##########
 113
 114 ## converts string "hhhh hhhh hhhh" to a numeric list
 115 ## (hex digits separated by spaces)
 116 sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
 117
 118 while ($Combin =~ /(.+)/g) {
 119     my @tab = split /\t/, $1;
 120     my $ini = hex $tab[0];
 121     if ($tab[1] eq '') {
 122         $Combin{$ini} = $tab[2];
 123     } else {
 124         $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
 125     }
 126 }
 127
 128 while ($Decomp =~ /(.+)/g) {
 129     my @tab = split /\t/, $1;
 130     my $compat = $tab[2] =~ s/<[^>]+>//;
 131     my $dec = [ _getHexArray($tab[2]) ]; # decomposition
 132     my $ini = hex($tab[0]); # initial decomposable character
 133     my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
 134     # ($ini .. $end) is the range of decomposable characters.
 135
 136     foreach my $u ($ini .. $end) {
 137         $Compat{$u} = $dec;
 138         $Canon{$u} = $dec if ! $compat;
 139     }
 140 }
 141
 142 for my $s (@CompEx) {
 143     my $u = hex $s;
 144     next if !$Canon{$u}; # not assigned
 145     next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
 146     $Exclus{$u} = 1;
 147 }
 148
 149 foreach my $u (keys %Canon) {
 150     my $dec = $Canon{$u};
 151
 152     if (@$dec == 2) {
 153         if ($Combin{ $dec->[0] }) {
 154             $NonStD{$u} = 1;
 155         } else {
 156             $Compos{ $dec->[0] }{ $dec->[1] } = $u;
 157             $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
 158         }
 159     } elsif (@$dec == 1) {
 160         $Single{$u} = 1;
 161     } else {
 162         my $h = sprintf '%04X', $u;
 163         croak("Weird Canonical Decomposition of U+$h");
 164     }
 165 }
 166
 167 # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
 168 foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
 169     $Comp2nd{$j} = 1;
 170 }
 171
 172 sub getCanonList {
 173     my @src = @_;
 174     my @dec = map {
 175         (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
 176             : $Canon{$_} ? @{ $Canon{$_} } : $_
 177                 } @src;
 178     return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
 179     # condition @src == @dec is not ok.
 180 }
 181
 182 sub getCompatList {
 183     my @src = @_;
 184     my @dec = map {
 185         (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
 186             : $Compat{$_} ? @{ $Compat{$_} } : $_
 187                 } @src;
 188     return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
 189     # condition @src == @dec is not ok.
 190 }
 191
 192 # exhaustive decomposition
 193 foreach my $key (keys %Canon) {
 194     $Canon{$key}  = [ getCanonList($key) ];
 195 }
 196
 197 # exhaustive decomposition
 198 foreach my $key (keys %Compat) {
 199     $Compat{$key} = [ getCompatList($key) ];
 200 }
 201
 202 sub getHangulComposite ($$) {
 203     if ((LBase <= $_[0] && $_[0] <= LFinal)
 204      && (VBase <= $_[1] && $_[1] <= VFinal)) {
 205         my $lindex = $_[0] - LBase;
 206         my $vindex = $_[1] - VBase;
 207         return (SBase + ($lindex * VCount + $vindex) * TCount);
 208     }
 209     if ((SBase <= $_[0] && $_[0] <= SFinal && (($_[0] - SBase ) % TCount) == 0)
 210      && (TBase  < $_[1] && $_[1] <= TFinal)) {
 211         return($_[0] + $_[1] - TBase);
 212     }
 213     return undef;
 214 }
 215
 216 ##########
 217
 218 sub getCombinClass ($) {
 219     my $uv = 0 + shift;
 220     return $Combin{$uv} || 0;
 221 }
 222
 223 sub getCanon ($) {
 224     my $uv = 0 + shift;
 225     return exists $Canon{$uv}
 226         ? pack_U(@{ $Canon{$uv} })
 227         : (SBase <= $uv && $uv <= SFinal)
 228             ? scalar decomposeHangul($uv)
 229             : undef;
 230 }
 231
 232 sub getCompat ($) {
 233     my $uv = 0 + shift;
 234     return exists $Compat{$uv}
 235         ? pack_U(@{ $Compat{$uv} })
 236         : (SBase <= $uv && $uv <= SFinal)
 237             ? scalar decomposeHangul($uv)
 238             : undef;
 239 }
 240
 241 sub getComposite ($$) {
 242     my $uv1 = 0 + shift;
 243     my $uv2 = 0 + shift;
 244     my $hangul = getHangulComposite($uv1, $uv2);
 245     return $hangul if $hangul;
 246     return $Compos{ $uv1 } && $Compos{ $uv1 }{ $uv2 };
 247 }
 248
 249 sub isExclusion  ($) {
 250     my $uv = 0 + shift;
 251     return exists $Exclus{$uv};
 252 }
 253
 254 sub isSingleton  ($) {
 255     my $uv = 0 + shift;
 256     return exists $Single{$uv};
 257 }
 258
 259 sub isNonStDecomp($) {
 260     my $uv = 0 + shift;
 261     return exists $NonStD{$uv};
 262 }
 263
 264 sub isComp2nd ($) {
 265     my $uv = 0 + shift;
 266     return exists $Comp2nd{$uv};
 267 }
 268
 269 sub isNFC_MAYBE ($) {
 270     my $uv = 0 + shift;
 271     return exists $Comp2nd{$uv};
 272 }
 273
 274 sub isNFKC_MAYBE($) {
 275     my $uv = 0 + shift;
 276     return exists $Comp2nd{$uv};
 277 }
 278
 279 sub isNFD_NO ($) {
 280     my $uv = 0 + shift;
 281     return exists $Canon {$uv} || (SBase <= $uv && $uv <= SFinal);
 282 }
 283
 284 sub isNFKD_NO ($) {
 285     my $uv = 0 + shift;
 286     return exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
 287 }
 288
 289 sub isComp_Ex ($) {
 290     my $uv = 0 + shift;
 291     return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
 292 }
 293
 294 sub isNFC_NO ($) {
 295     my $uv = 0 + shift;
 296     return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
 297 }
 298
 299 sub isNFKC_NO ($) {
 300     my $uv = 0 + shift;
 301     return 1  if $Exclus{$uv} || $Single{$uv} || $NonStD{$uv};
 302     return '' if (SBase <= $uv && $uv <= SFinal) || !exists $Compat{$uv};
 303     return 1  if ! exists $Canon{$uv};
 304     return pack('N*', @{ $Canon{$uv} }) ne pack('N*', @{ $Compat{$uv} });
 305 }
 306
 307 ##
 308 ## string decompose(string, compat?)
 309 ##
 310 sub decompose ($;$)
 311 {
 312     my $hash = $_[1] ? \%Compat : \%Canon;
 313     return pack_U map {
 314         $hash->{ $_ } ? @{ $hash->{ $_ } } :
 315             (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) : $_
 316     } unpack_U($_[0]);
 317 }
 318
 319 ##
 320 ## string reorder(string)
 321 ##
 322 sub reorder ($)
 323 {
 324     my @src = unpack_U($_[0]);
 325
 326     for (my $i=0; $i < @src;) {
 327         $i++, next if ! $Combin{ $src[$i] };
 328
 329         my $ini = $i;
 330         $i++ while $i < @src && $Combin{ $src[$i] };
 331
 332         my @tmp = sort {
 333                 $Combin{ $src[$a] } <=> $Combin{ $src[$b] } || $a <=> $b
 334             } $ini .. $i - 1;
 335
 336         @src[ $ini .. $i - 1 ] = @src[ @tmp ];
 337     }
 338     return pack_U(@src);
 339 }
 340
 341
 342 ##
 343 ## string compose(string)
 344 ##
 345 ## S : starter; NS : not starter;
 346 ##
 347 ## composable sequence begins at S.
 348 ## S + S or (S + S) + S may be composed.
 349 ## NS + NS must not be composed.
 350 ##
 351 sub compose ($)
 352 {
 353     my @src = unpack_U($_[0]);
 354
 355     for (my $s = 0; $s+1 < @src; $s++) {
 356         next unless defined $src[$s] && ! $Combin{ $src[$s] };
 357          # S only; removed or combining are skipped as a starter.
 358
 359         my($c, $blocked, $uncomposed_cc);
 360         for (my $j = $s+1; $j < @src && !$blocked; $j++) {
 361             ($Combin{ $src[$j] } ? $uncomposed_cc : $blocked) = 1;
 362
 363             # S + C + S => S-S + C would be blocked.
 364             next if $blocked && $uncomposed_cc;
 365
 366             # blocked by same CC (and higher CC: revised D2)
 367             next if defined $src[$j-1]   && $Combin{ $src[$j-1] }
 368                 && $Combin{ $src[$j-1] } >= $Combin{ $src[$j] };
 369
 370             $c = getComposite($src[$s], $src[$j]);
 371
 372             # no composite or is exclusion
 373             next if !$c || $Exclus{$c};
 374
 375             # replace by composite
 376             $src[$s] = $c; $src[$j] = undef;
 377             if ($blocked) { $blocked = 0 } else { -- $uncomposed_cc }
 378         }
 379     }
 380     return pack_U(grep defined, @src);
 381 }
 382
 383
 384 ##
 385 ## string composeContiguous(string)
 386 ##
 387 sub composeContiguous ($)
 388 {
 389     my @src = unpack_U($_[0]);
 390
 391     for (my $s = 0; $s+1 < @src; $s++) {
 392         next unless defined $src[$s] && ! $Combin{ $src[$s] };
 393          # S only; removed or combining are skipped as a starter.
 394
 395         for (my $j = $s+1; $j < @src; $j++) {
 396             my $c = getComposite($src[$s], $src[$j]);
 397
 398             # no composite or is exclusion
 399             last if !$c || $Exclus{$c};
 400
 401             # replace by composite
 402             $src[$s] = $c; $src[$j] = undef;
 403         }
 404     }
 405     return pack_U(grep defined, @src);
 406 }
 407
 408
 409 ##
 410 ## normalization forms
 411 ##
 412
 413 use constant COMPAT => 1;
 414
 415 sub NFD  ($) { reorder(decompose($_[0])) }
 416 sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
 417 sub NFC  ($) { compose(reorder(decompose($_[0]))) }
 418 sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
 419 sub FCC  ($) { composeContiguous(reorder(decompose($_[0]))) }
 420
 421 ##
 422 ## quick check
 423 ##
 424
 425 sub checkNFD ($)
 426 {
 427     my $preCC = 0;
 428     my $curCC;
 429     for my $uv (unpack_U($_[0])) {
 430         $curCC = $Combin{ $uv } || 0;
 431         return '' if $preCC > $curCC && $curCC != 0;
 432         return '' if exists $Canon{$uv} || (SBase <= $uv && $uv <= SFinal);
 433         $preCC = $curCC;
 434     }
 435     return 1;
 436 }
 437
 438 sub checkNFKD ($)
 439 {
 440     my $preCC = 0;
 441     my $curCC;
 442     for my $uv (unpack_U($_[0])) {
 443         $curCC = $Combin{ $uv } || 0;
 444         return '' if $preCC > $curCC && $curCC != 0;
 445         return '' if exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
 446         $preCC = $curCC;
 447     }
 448     return 1;
 449 }
 450
 451 sub checkNFC ($)
 452 {
 453     my $preCC = 0;
 454     my($curCC, $isMAYBE);
 455     for my $uv (unpack_U($_[0])) {
 456         $curCC = $Combin{ $uv } || 0;
 457         return '' if $preCC > $curCC && $curCC != 0;
 458
 459         if (isNFC_MAYBE($uv)) {
 460             $isMAYBE = 1;
 461         } elsif (isNFC_NO($uv)) {
 462             return '';
 463         }
 464         $preCC = $curCC;
 465     }
 466     return $isMAYBE ? undef : 1;
 467 }
 468
 469 sub checkNFKC ($)
 470 {
 471     my $preCC = 0;
 472     my($curCC, $isMAYBE);
 473     for my $uv (unpack_U($_[0])) {
 474         $curCC = $Combin{ $uv } || 0;
 475         return '' if $preCC > $curCC && $curCC != 0;
 476
 477         if (isNFKC_MAYBE($uv)) {
 478             $isMAYBE = 1;
 479         } elsif (isNFKC_NO($uv)) {
 480             return '';
 481         }
 482         $preCC = $curCC;
 483     }
 484     return $isMAYBE ? undef : 1;
 485 }
 486
 487 sub checkFCD ($)
 488 {
 489     my $preCC = 0;
 490     my $curCC;
 491     for my $uv (unpack_U($_[0])) {
 492         # Hangul syllable need not decomposed since cc[any Jamo] == 0;
 493         my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
 494
 495         $curCC = $Combin{ $uvCan[0] } || 0;
 496         return '' if $curCC != 0 && $curCC < $preCC;
 497         $preCC = $Combin{ $uvCan[-1] } || 0;
 498     }
 499     return 1;
 500 }
 501
 502 sub checkFCC ($)
 503 {
 504     my $preCC = 0;
 505     my($curCC, $isMAYBE);
 506     for my $uv (unpack_U($_[0])) {
 507         # Hangul syllable need not decomposed since cc[any Jamo] == 0;
 508         my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
 509
 510         $curCC = $Combin{ $uvCan[0] } || 0;
 511         return '' if $curCC != 0 && $curCC < $preCC;
 512
 513         if (isNFC_MAYBE($uv)) {
 514             $isMAYBE = 1;
 515         } elsif (isNFC_NO($uv)) {
 516             return '';
 517         }
 518
 519         $preCC = $Combin{ $uvCan[-1] } || 0;
 520     }
 521     return $isMAYBE ? undef : 1;
 522 }
 523
 524 ##
 525 ## split on last starter
 526 ##
 527
 528 sub splitOnLastStarter
 529 {
 530     my $str = pack_U(unpack_U(shift));
 531     if ($str eq '') {
 532         return ('', '');
 533     }
 534
 535     my $ch;
 536     my $unproc = "";
 537     do {
 538         $ch = chop($str);
 539         $unproc = $ch.$unproc;
 540     } while (getCombinClass(unpack 'U', $ch) && $str ne "");
 541     return ($str, $unproc);
 542 }
 543
 544 ##
 545 ## normalize
 546 ##
 547
 548 sub FCD ($) {
 549     my $str = shift;
 550     return checkFCD($str) ? $str : NFD($str);
 551 }
 552
 553 our %formNorm = (
 554     NFC  => \&NFC,      C  => \&NFC,
 555     NFD  => \&NFD,      D  => \&NFD,
 556     NFKC => \&NFKC,     KC => \&NFKC,
 557     NFKD => \&NFKD,     KD => \&NFKD,
 558     FCD  => \&FCD,      FCC => \&FCC,
 559 );
 560
 561 sub normalize($$)
 562 {
 563     my $form = shift;
 564     my $str = shift;
 565     if (exists $formNorm{$form}) {
 566         return $formNorm{$form}->($str);
 567     }
 568     croak($PACKAGE."::normalize: invalid form name: $form");
 569 }
 570
 571 ##
 572 ## partial
 573 ##
 574
 575 sub normalize_partial ($$) {
 576     if (exists $formNorm{$_[0]}) {
 577         my $n = normalize($_[0], $_[1]);
 578         my($p, $u) = splitOnLastStarter($n);
 579         $_[1] = $u;
 580         return $p;
 581     }
 582     croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
 583 }
 584
 585 sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
 586 sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
 587 sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
 588 sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
 589
 590 ##
 591 ## check
 592 ##
 593
 594 our %formCheck = (
 595     NFC  => \&checkNFC,         C  => \&checkNFC,
 596     NFD  => \&checkNFD,         D  => \&checkNFD,
 597     NFKC => \&checkNFKC,        KC => \&checkNFKC,
 598     NFKD => \&checkNFKD,        KD => \&checkNFKD,
 599     FCD  => \&checkFCD,         FCC => \&checkFCC,
 600 );
 601
 602 sub check($$)
 603 {
 604     my $form = shift;
 605     my $str = shift;
 606     if (exists $formCheck{$form}) {
 607         return $formCheck{$form}->($str);
 608     }
 609     croak($PACKAGE."::check: invalid form name: $form");
 610 }
 611
 612 1;
 613 __END__
 614
 615 =head1 NAME
 616
 617 Unicode::Normalize - Unicode Normalization Forms
 618
 619 =head1 SYNOPSIS
 620
 621 (1) using function names exported by default:
 622
 623   use Unicode::Normalize;
 624
 625   $NFD_string  = NFD($string);  # Normalization Form D
 626   $NFC_string  = NFC($string);  # Normalization Form C
 627   $NFKD_string = NFKD($string); # Normalization Form KD
 628   $NFKC_string = NFKC($string); # Normalization Form KC
 629
 630 (2) using function names exported on request:
 631
 632   use Unicode::Normalize 'normalize';
 633
 634   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 635   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 636   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 637   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 638
 639 =head1 DESCRIPTION
 640
 641 Parameters:
 642
 643 C<$string> is used as a string under character semantics (see F<perlunicode>).
 644
 645 C<$code_point> should be an unsigned integer representing a Unicode code point.
 646
 647 Note: Do not use a floating point nor a negative sign in C<$code_point>.
 648
 649 =head2 Normalization Forms
 650
 651 =over 4
 652
 653 =item C<$NFD_string = NFD($string)>
 654
 655 It returns the Normalization Form D (formed by canonical decomposition).
 656
 657 =item C<$NFC_string = NFC($string)>
 658
 659 It returns the Normalization Form C (formed by canonical decomposition
 660 followed by canonical composition).
 661
 662 =item C<$NFKD_string = NFKD($string)>
 663
 664 It returns the Normalization Form KD (formed by compatibility decomposition).
 665
 666 =item C<$NFKC_string = NFKC($string)>
 667
 668 It returns the Normalization Form KC (formed by compatibility decomposition
 669 followed by B<canonical> composition).
 670
 671 =item C<$FCD_string = FCD($string)>
 672
 673 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 674 it returns the string without modification; otherwise it returns an FCD string.
 675
 676 Note: FCD is not always unique, then plural forms may be equivalent
 677 each other. C<FCD()> will return one of these equivalent forms.
 678
 679 =item C<$FCC_string = FCC($string)>
 680
 681 It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 682
 683 Note: FCC is unique, as well as four normalization forms (NF*).
 684
 685 =item C<$normalized_string = normalize($form_name, $string)>
 686
 687 It returns the normalization form of C<$form_name>.
 688
 689 As C<$form_name>, one of the following names must be given.
 690
 691   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 692   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 693   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 694   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 695
 696   'FCD'          for "Fast C or D" Form  (UTN #5)
 697   'FCC'          for "Fast C Contiguous" (UTN #5)
 698
 699 =back
 700
 701 =head2 Decomposition and Composition
 702
 703 =over 4
 704
 705 =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
 706
 707 It returns the concatenation of the decomposition of each character
 708 in the string.
 709
 710 If the second parameter (a boolean) is omitted or false,
 711 the decomposition is canonical decomposition;
 712 if the second parameter (a boolean) is true,
 713 the decomposition is compatibility decomposition.
 714
 715 The string returned is not always in NFD/NFKD. Reordering may be required.
 716
 717     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 718     $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 719
 720 =item C<$reordered_string = reorder($string)>
 721
 722 It returns the result of reordering the combining characters
 723 according to Canonical Ordering Behavior.
 724
 725 For example, when you have a list of NFD/NFKD strings,
 726 you can get the concatenated NFD/NFKD string from them, by saying
 727
 728     $concat_NFD  = reorder(join '', @NFD_strings);
 729     $concat_NFKD = reorder(join '', @NFKD_strings);
 730
 731 =item C<$composed_string = compose($string)>
 732
 733 It returns the result of canonical composition
 734 without applying any decomposition.
 735
 736 For example, when you have a NFD/NFKD string,
 737 you can get its NFC/NFKC string, by saying
 738
 739     $NFC_string  = compose($NFD_string);
 740     $NFKC_string = compose($NFKD_string);
 741
 742 =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
 743
 744 It returns two strings: the first one, C<$processed>, is a part
 745 before the last starter, and the second one, C<$unprocessed> is
 746 another part after the first part. A starter is a character having
 747 a combining class of zero (see UAX #15).
 748
 749 Note that C<$processed> may be empty (when C<$normalized> contains no
 750 starter or starts with the last starter), and then C<$unprocessed>
 751 should be equal to the entire C<$normalized>.
 752
 753 When you have a C<$normalized> string and an C<$unnormalized> string
 754 following it, a simple concatenation is wrong:
 755
 756     $concat = $normalized . normalize($form, $unnormalized); # wrong!
 757
 758 Instead of it, do like this:
 759
 760     ($processed, $unprocessed) = splitOnLastStarter($normalized);
 761      $concat = $processed . normalize($form, $unprocessed.$unnormalized);
 762
 763 C<splitOnLastStarter()> should be called with a pre-normalized parameter
 764 C<$normalized>, that is in the same form as C<$form> you want.
 765
 766 If you have an array of C<@string> that should be concatenated and then
 767 normalized, you can do like this:
 768
 769     my $result = "";
 770     my $unproc = "";
 771     foreach my $str (@string) {
 772         $unproc .= $str;
 773         my $n = normalize($form, $unproc);
 774         my($p, $u) = splitOnLastStarter($n);
 775         $result .= $p;
 776         $unproc  = $u;
 777     }
 778     $result .= $unproc;
 779     # instead of normalize($form, join('', @string))
 780
 781 =item C<$processed = normalize_partial($form, $unprocessed)>
 782
 783 A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
 784 Note that C<$unprocessed> will be modified as a side-effect.
 785
 786 If you have an array of C<@string> that should be concatenated and then
 787 normalized, you can do like this:
 788
 789     my $result = "";
 790     my $unproc = "";
 791     foreach my $str (@string) {
 792         $unproc .= $str;
 793         $result .= normalize_partial($form, $unproc);
 794     }
 795     $result .= $unproc;
 796     # instead of normalize($form, join('', @string))
 797
 798 =item C<$processed = NFD_partial($unprocessed)>
 799
 800 It does like C<normalize_partial('NFD', $unprocessed)>.
 801 Note that C<$unprocessed> will be modified as a side-effect.
 802
 803 =item C<$processed = NFC_partial($unprocessed)>
 804
 805 It does like C<normalize_partial('NFC', $unprocessed)>.
 806 Note that C<$unprocessed> will be modified as a side-effect.
 807
 808 =item C<$processed = NFKD_partial($unprocessed)>
 809
 810 It does like C<normalize_partial('NFKD', $unprocessed)>.
 811 Note that C<$unprocessed> will be modified as a side-effect.
 812
 813 =item C<$processed = NFKC_partial($unprocessed)>
 814
 815 It does like C<normalize_partial('NFKC', $unprocessed)>.
 816 Note that C<$unprocessed> will be modified as a side-effect.
 817
 818 =back
 819
 820 =head2 Quick Check
 821
 822 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
 823
 824 The following functions check whether the string is in that normalization form.
 825
 826 The result returned will be one of the following:
 827
 828     YES     The string is in that normalization form.
 829     NO      The string is not in that normalization form.
 830     MAYBE   Dubious. Maybe yes, maybe no.
 831
 832 =over 4
 833
 834 =item C<$result = checkNFD($string)>
 835
 836 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 837
 838 =item C<$result = checkNFC($string)>
 839
 840 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 841 C<undef> if C<MAYBE>.
 842
 843 =item C<$result = checkNFKD($string)>
 844
 845 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 846
 847 =item C<$result = checkNFKC($string)>
 848
 849 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 850 C<undef> if C<MAYBE>.
 851
 852 =item C<$result = checkFCD($string)>
 853
 854 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 855
 856 =item C<$result = checkFCC($string)>
 857
 858 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 859 C<undef> if C<MAYBE>.
 860
 861 Note: If a string is not in FCD, it must not be in FCC.
 862 So C<checkFCC($not_FCD_string)> should return C<NO>.
 863
 864 =item C<$result = check($form_name, $string)>
 865
 866 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 867 C<undef> if C<MAYBE>.
 868
 869 As C<$form_name>, one of the following names must be given.
 870
 871   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 872   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 873   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 874   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 875
 876   'FCD'          for "Fast C or D" Form  (UTN #5)
 877   'FCC'          for "Fast C Contiguous" (UTN #5)
 878
 879 =back
 880
 881 B<Note>
 882
 883 In the cases of NFD, NFKD, and FCD, the answer must be
 884 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 885 in the cases of NFC, NFKC, and FCC.
 886
 887 A C<MAYBE> string should contain at least one combining character
 888 or the like. For example, C<COMBINING ACUTE ACCENT> has
 889 the MAYBE_NFC/MAYBE_NFKC property.
 890
 891 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 892 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 893 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 894 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 895 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 896
 897 If you want to check exactly, compare the string with its NFC/NFKC/FCC.
 898
 899     if ($string eq NFC($string)) {
 900         # $string is exactly normalized in NFC;
 901     } else {
 902         # $string is not normalized in NFC;
 903     }
 904
 905     if ($string eq NFKC($string)) {
 906         # $string is exactly normalized in NFKC;
 907     } else {
 908         # $string is not normalized in NFKC;
 909     }
 910
 911 =head2 Character Data
 912
 913 These functions are interface of character data used internally.
 914 If you want only to get Unicode normalization forms, you don't need
 915 call them yourself.
 916
 917 =over 4
 918
 919 =item C<$canonical_decomposition = getCanon($code_point)>
 920
 921 If the character is canonically decomposable (including Hangul Syllables),
 922 it returns the (full) canonical decomposition as a string.
 923 Otherwise it returns C<undef>.
 924
 925 B<Note:> According to the Unicode standard, the canonical decomposition
 926 of the character that is not canonically decomposable is same as
 927 the character itself.
 928
 929 =item C<$compatibility_decomposition = getCompat($code_point)>
 930
 931 If the character is compatibility decomposable (including Hangul Syllables),
 932 it returns the (full) compatibility decomposition as a string.
 933 Otherwise it returns C<undef>.
 934
 935 B<Note:> According to the Unicode standard, the compatibility decomposition
 936 of the character that is not compatibility decomposable is same as
 937 the character itself.
 938
 939 =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
 940
 941 If two characters here and next (as code points) are composable
 942 (including Hangul Jamo/Syllables and Composition Exclusions),
 943 it returns the code point of the composite.
 944
 945 If they are not composable, it returns C<undef>.
 946
 947 =item C<$combining_class = getCombinClass($code_point)>
 948
 949 It returns the combining class (as an integer) of the character.
 950
 951 =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
 952
 953 It returns a boolean whether the character of the specified codepoint
 954 may be composed with the previous one in a certain composition
 955 (including Hangul Compositions, but excluding
 956 Composition Exclusions and Non-Starter Decompositions).
 957
 958 =item C<$is_exclusion = isExclusion($code_point)>
 959
 960 It returns a boolean whether the code point is a composition exclusion.
 961
 962 =item C<$is_singleton = isSingleton($code_point)>
 963
 964 It returns a boolean whether the code point is a singleton
 965
 966 =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
 967
 968 It returns a boolean whether the code point has Non-Starter Decomposition.
 969
 970 =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
 971
 972 It returns a boolean of the derived property Comp_Ex
 973 (Full_Composition_Exclusion). This property is generated from
 974 Composition Exclusions + Singletons + Non-Starter Decompositions.
 975
 976 =item C<$NFD_is_NO = isNFD_NO($code_point)>
 977
 978 It returns a boolean of the derived property NFD_NO
 979 (NFD_Quick_Check=No).
 980
 981 =item C<$NFC_is_NO = isNFC_NO($code_point)>
 982
 983 It returns a boolean of the derived property NFC_NO
 984 (NFC_Quick_Check=No).
 985
 986 =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
 987
 988 It returns a boolean of the derived property NFC_MAYBE
 989 (NFC_Quick_Check=Maybe).
 990
 991 =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
 992
 993 It returns a boolean of the derived property NFKD_NO
 994 (NFKD_Quick_Check=No).
 995
 996 =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
 997
 998 It returns a boolean of the derived property NFKC_NO
 999 (NFKC_Quick_Check=No).
1000
1001 =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
1002
1003 It returns a boolean of the derived property NFKC_MAYBE
1004 (NFKC_Quick_Check=Maybe).
1005
1006 =back
1007
1008 =head1 EXPORT
1009
1010 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
1011
1012 C<normalize> and other some functions: on request.
1013
1014 =head1 CAVEATS
1015
1016 =over 4
1017
1018 =item Perl's version vs. Unicode version
1019
1020 Since this module refers to perl core's Unicode database in the directory
1021 F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
1022 normalization implemented by this module depends on your perl's version.
1023
1024     perl's version     implemented Unicode version
1025        5.6.1              3.0.1
1026        5.7.2              3.1.0
1027        5.7.3              3.1.1 (normalization is same as 3.1.0)
1028        5.8.0              3.2.0
1029      5.8.1-5.8.3          4.0.0
1030      5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
1031      5.8.7-5.8.8          4.1.0
1032        5.10.0             5.0.0
1033     5.8.9, 5.10.1         5.1.0
1034        5.12.x             5.2.0
1035        5.14.x             6.0.0
1036        5.16.x             6.1.0
1037        5.18.x             6.2.0
1038
1039 =item Correction of decomposition mapping
1040
1041 In older Unicode versions, a small number of characters (all of which are
1042 CJK compatibility ideographs as far as they have been found) may have
1043 an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
1044 Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
1045 nor provide any specific version of normalization. Therefore this module
1046 running on an older perl with an older Unicode database may use
1047 the erroneous decomposition mapping blindly conforming to the Unicode database.
1048
1049 =item Revised definition of canonical composition
1050
1051 In Unicode 4.1.0, the definition D2 of canonical composition (which
1052 affects NFC and NFKC) has been changed (see Public Review Issue #29
1053 and recent UAX #15). This module has used the newer definition
1054 since the version 0.07 (Oct 31, 2001).
1055 This module will not support the normalization according to the older
1056 definition, even if the Unicode version implemented by perl is
1057 lower than 4.1.0.
1058
1059 =back
1060
1061 =head1 AUTHOR
1062
1063 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1064
1065 Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
1066
1067 This module is free software; you can redistribute it
1068 and/or modify it under the same terms as Perl itself.
1069
1070 =head1 SEE ALSO
1071
1072 =over 4
1073
1074 =item http://www.unicode.org/reports/tr15/
1075
1076 Unicode Normalization Forms - UAX #15
1077
1078 =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
1079
1080 Composition Exclusion Table
1081
1082 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
1083
1084 Derived Normalization Properties
1085
1086 =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
1087
1088 Normalization Corrections
1089
1090 =item http://www.unicode.org/review/pr-29.html
1091
1092 Public Review Issue #29: Normalization Issue
1093
1094 =item http://www.unicode.org/notes/tn5/
1095
1096 Canonical Equivalence in Applications - UTN #5
1097
1098 =back
1099
1100 =cut