1 package Unicode::Normalize;
4 unless ('A' eq pack('U', 0x41)) {
5 die "Unicode::Normalize cannot stringify a Unicode code point\n";
7 unless (0x41 == unpack('U', 'A')) {
8 die "Unicode::Normalize cannot get Unicode code point\n";
19 our $VERSION = '1.18';
20 our $PACKAGE = __PACKAGE__;
22 our @EXPORT = qw( NFC NFD NFKC NFKD );
24 normalize decompose reorder compose
25 checkNFD checkNFKD checkNFC checkNFKC check
26 getCanon getCompat getComposite getCombinClass
27 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
28 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
29 FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
30 normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
33 all => [ @EXPORT, @EXPORT_OK ],
34 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
35 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
36 fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
40 ## utilities for tests
44 return pack('U*', @_);
48 return unpack('U*', shift(@_).pack('U*'));
53 our @ISA = qw(Exporter);
56 our %Combin; # $codepoint => $number : combination class
57 our %Canon; # $codepoint => \@codepoints : canonical decomp.
58 our %Compat; # $codepoint => \@codepoints : compat. decomp.
59 our %Compos; # $1st,$2nd => $codepoint : composite
60 our %Exclus; # $codepoint => 1 : composition exclusions
61 our %Single; # $codepoint => 1 : singletons
62 our %NonStD; # $codepoint => 1 : non-starter decompositions
63 our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
65 # from core Unicode database
66 our $Combin = do "unicore/CombiningClass.pl"
67 || do "unicode/CombiningClass.pl"
68 || croak "$PACKAGE: CombiningClass.pl not found";
69 our $Decomp = do "unicore/Decomposition.pl"
70 || do "unicode/Decomposition.pl"
71 || croak "$PACKAGE: Decomposition.pl not found";
73 # CompositionExclusions.txt since Unicode 3.2.0
75 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
76 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
77 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
78 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
79 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
80 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
81 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
84 # definition of Hangul constants
# Hangul constants.  Every *Final value equals its Base + Count - 1, and
# the derived counts are written as products, so the relationships between
# the constants are visible instead of being repeated as literals.
use constant LBase  => 0x1100;
use constant LCount => 19;
use constant LFinal => LBase + LCount - 1;   # 0x1112
use constant VBase  => 0x1161;
use constant VCount => 21;
use constant VFinal => VBase + VCount - 1;   # 0x1175
use constant TBase  => 0x11A7;
use constant TCount => 28;
use constant TFinal => TBase + TCount - 1;   # 0x11C2
use constant NCount => VCount * TCount;      # 588
use constant SCount => LCount * NCount;      # 11172
use constant SBase  => 0xAC00;
use constant SFinal => SBase + SCount - 1;   # 0xD7A3
100 my $sindex = $_[0] - SBase;
101 my $lindex = int( $sindex / NCount);
102 my $vindex = int(($sindex % NCount) / TCount);
103 my $tindex = $sindex % TCount;
107 $tindex ? (TBase + $tindex) : (),
109 return wantarray ? @ret : pack_U(@ret);
112 ########## getting full decomposition ##########
## Turns a string of the form "hhhh hhhh hhhh" (hex numbers separated by
## spaces) into the list of the corresponding numeric values.
## Scanning starts at the beginning of the string and stops at the first
## position that is not "optional spaces followed by hex digits".
sub _getHexArray {
    # \G keeps each match adjacent to the previous one, so only the leading
    # run of hex fields is picked up.
    my @hex_fields = $_[0] =~ /\G *([0-9A-Fa-f]+)/g;
    return map { hex } @hex_fields;
}
118 while ($Combin =~ /(.+)/g) {
119 my @tab = split /\t/, $1;
120 my $ini = hex $tab[0];
122 $Combin{$ini} = $tab[2];
124 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
128 while ($Decomp =~ /(.+)/g) {
129 my @tab = split /\t/, $1;
130 my $compat = $tab[2] =~ s/<[^>]+>//;
131 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
132 my $ini = hex($tab[0]); # initial decomposable character
133 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
134 # ($ini .. $end) is the range of decomposable characters.
136 foreach my $u ($ini .. $end) {
138 $Canon{$u} = $dec if ! $compat;
142 for my $s (@CompEx) {
144 next if !$Canon{$u}; # not assigned
145 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
149 foreach my $u (keys %Canon) {
150 my $dec = $Canon{$u};
153 if ($Combin{ $dec->[0] }) {
156 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
157 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
159 } elsif (@$dec == 1) {
162 my $h = sprintf '%04X', $u;
163 croak("Weird Canonical Decomposition of U+$h");
167 # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
168 foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
175 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
176 : $Canon{$_} ? @{ $Canon{$_} } : $_
178 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
179 # condition @src == @dec is not ok.
185 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
186 : $Compat{$_} ? @{ $Compat{$_} } : $_
188 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
189 # condition @src == @dec is not ok.
192 # exhaustive decomposition
193 foreach my $key (keys %Canon) {
194 $Canon{$key} = [ getCanonList($key) ];
197 # exhaustive decomposition
198 foreach my $key (keys %Compat) {
199 $Compat{$key} = [ getCompatList($key) ];
202 sub getHangulComposite ($$) {
203 if ((LBase <= $_[0] && $_[0] <= LFinal)
204 && (VBase <= $_[1] && $_[1] <= VFinal)) {
205 my $lindex = $_[0] - LBase;
206 my $vindex = $_[1] - VBase;
207 return (SBase + ($lindex * VCount + $vindex) * TCount);
209 if ((SBase <= $_[0] && $_[0] <= SFinal && (($_[0] - SBase ) % TCount) == 0)
210 && (TBase < $_[1] && $_[1] <= TFinal)) {
211 return($_[0] + $_[1] - TBase);
218 sub getCombinClass ($) {
220 return $Combin{$uv} || 0;
225 return exists $Canon{$uv}
226 ? pack_U(@{ $Canon{$uv} })
227 : (SBase <= $uv && $uv <= SFinal)
228 ? scalar decomposeHangul($uv)
234 return exists $Compat{$uv}
235 ? pack_U(@{ $Compat{$uv} })
236 : (SBase <= $uv && $uv <= SFinal)
237 ? scalar decomposeHangul($uv)
241 sub getComposite ($$) {
244 my $hangul = getHangulComposite($uv1, $uv2);
245 return $hangul if $hangul;
246 return $Compos{ $uv1 } && $Compos{ $uv1 }{ $uv2 };
249 sub isExclusion ($) {
251 return exists $Exclus{$uv};
254 sub isSingleton ($) {
256 return exists $Single{$uv};
259 sub isNonStDecomp($) {
261 return exists $NonStD{$uv};
266 return exists $Comp2nd{$uv};
269 sub isNFC_MAYBE ($) {
271 return exists $Comp2nd{$uv};
274 sub isNFKC_MAYBE($) {
276 return exists $Comp2nd{$uv};
281 return exists $Canon {$uv} || (SBase <= $uv && $uv <= SFinal);
286 return exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
291 return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
296 return exists $Exclus{$uv} || exists $Single{$uv} || exists $NonStD{$uv};
301 return 1 if $Exclus{$uv} || $Single{$uv} || $NonStD{$uv};
302 return '' if (SBase <= $uv && $uv <= SFinal) || !exists $Compat{$uv};
303 return 1 if ! exists $Canon{$uv};
304 return pack('N*', @{ $Canon{$uv} }) ne pack('N*', @{ $Compat{$uv} });
308 ## string decompose(string, compat?)
312 my $hash = $_[1] ? \%Compat : \%Canon;
314 $hash->{ $_ } ? @{ $hash->{ $_ } } :
315 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) : $_
320 ## string reorder(string)
324 my @src = unpack_U($_[0]);
326 for (my $i=0; $i < @src;) {
327 $i++, next if ! $Combin{ $src[$i] };
330 $i++ while $i < @src && $Combin{ $src[$i] };
333 $Combin{ $src[$a] } <=> $Combin{ $src[$b] } || $a <=> $b
336 @src[ $ini .. $i - 1 ] = @src[ @tmp ];
343 ## string compose(string)
345 ## S : starter; NS : not starter;
347 ## composable sequence begins at S.
348 ## S + S or (S + S) + S may be composed.
349 ## NS + NS must not be composed.
353 my @src = unpack_U($_[0]);
355 for (my $s = 0; $s+1 < @src; $s++) {
356 next unless defined $src[$s] && ! $Combin{ $src[$s] };
357 # S only; removed or combining are skipped as a starter.
359 my($c, $blocked, $uncomposed_cc);
360 for (my $j = $s+1; $j < @src && !$blocked; $j++) {
361 ($Combin{ $src[$j] } ? $uncomposed_cc : $blocked) = 1;
363 # S + C + S => S-S + C would be blocked.
364 next if $blocked && $uncomposed_cc;
366 # blocked by same CC (and higher CC: revised D2)
367 next if defined $src[$j-1] && $Combin{ $src[$j-1] }
368 && $Combin{ $src[$j-1] } >= $Combin{ $src[$j] };
370 $c = getComposite($src[$s], $src[$j]);
372 # no composite or is exclusion
373 next if !$c || $Exclus{$c};
375 # replace by composite
376 $src[$s] = $c; $src[$j] = undef;
377 if ($blocked) { $blocked = 0 } else { -- $uncomposed_cc }
380 return pack_U(grep defined, @src);
385 ## string composeContiguous(string)
387 sub composeContiguous ($)
389 my @src = unpack_U($_[0]);
391 for (my $s = 0; $s+1 < @src; $s++) {
392 next unless defined $src[$s] && ! $Combin{ $src[$s] };
393 # S only; removed or combining are skipped as a starter.
395 for (my $j = $s+1; $j < @src; $j++) {
396 my $c = getComposite($src[$s], $src[$j]);
398 # no composite or is exclusion
399 last if !$c || $Exclus{$c};
401 # replace by composite
402 $src[$s] = $c; $src[$j] = undef;
405 return pack_U(grep defined, @src);
410 ## normalization forms
413 use constant COMPAT => 1;
# Each form below is a fixed pipeline applied to the single string argument:
# decompose() (with COMPAT passed for the compatibility forms), then
# reorder(), and for the composed forms compose() or composeContiguous().

sub NFD ($) {
    return reorder( decompose( $_[0] ) );
}

sub NFKD ($) {
    return reorder( decompose( $_[0], COMPAT ) );
}

sub NFC ($) {
    return compose( reorder( decompose( $_[0] ) ) );
}

sub NFKC ($) {
    return compose( reorder( decompose( $_[0], COMPAT ) ) );
}

sub FCC ($) {
    return composeContiguous( reorder( decompose( $_[0] ) ) );
}
429 for my $uv (unpack_U($_[0])) {
430 $curCC = $Combin{ $uv } || 0;
431 return '' if $preCC > $curCC && $curCC != 0;
432 return '' if exists $Canon{$uv} || (SBase <= $uv && $uv <= SFinal);
442 for my $uv (unpack_U($_[0])) {
443 $curCC = $Combin{ $uv } || 0;
444 return '' if $preCC > $curCC && $curCC != 0;
445 return '' if exists $Compat{$uv} || (SBase <= $uv && $uv <= SFinal);
454 my($curCC, $isMAYBE);
455 for my $uv (unpack_U($_[0])) {
456 $curCC = $Combin{ $uv } || 0;
457 return '' if $preCC > $curCC && $curCC != 0;
459 if (isNFC_MAYBE($uv)) {
461 } elsif (isNFC_NO($uv)) {
466 return $isMAYBE ? undef : 1;
472 my($curCC, $isMAYBE);
473 for my $uv (unpack_U($_[0])) {
474 $curCC = $Combin{ $uv } || 0;
475 return '' if $preCC > $curCC && $curCC != 0;
477 if (isNFKC_MAYBE($uv)) {
479 } elsif (isNFKC_NO($uv)) {
484 return $isMAYBE ? undef : 1;
491 for my $uv (unpack_U($_[0])) {
492 # Hangul syllables need not be decomposed since cc[any Jamo] == 0;
493 my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
495 $curCC = $Combin{ $uvCan[0] } || 0;
496 return '' if $curCC != 0 && $curCC < $preCC;
497 $preCC = $Combin{ $uvCan[-1] } || 0;
505 my($curCC, $isMAYBE);
506 for my $uv (unpack_U($_[0])) {
507 # Hangul syllables need not be decomposed since cc[any Jamo] == 0;
508 my @uvCan = exists $Canon{$uv} ? @{ $Canon{$uv} } : ($uv);
510 $curCC = $Combin{ $uvCan[0] } || 0;
511 return '' if $curCC != 0 && $curCC < $preCC;
513 if (isNFC_MAYBE($uv)) {
515 } elsif (isNFC_NO($uv)) {
519 $preCC = $Combin{ $uvCan[-1] } || 0;
521 return $isMAYBE ? undef : 1;
525 ## split on last starter
528 sub splitOnLastStarter
530 my $str = pack_U(unpack_U(shift));
539 $unproc = $ch.$unproc;
540 } while (getCombinClass(unpack 'U', $ch) && $str ne "");
541 return ($str, $unproc);
550 return checkFCD($str) ? $str : NFD($str);
554 NFC => \&NFC, C => \&NFC,
555 NFD => \&NFD, D => \&NFD,
556 NFKC => \&NFKC, KC => \&NFKC,
557 NFKD => \&NFKD, KD => \&NFKD,
558 FCD => \&FCD, FCC => \&FCC,
565 if (exists $formNorm{$form}) {
566 return $formNorm{$form}->($str);
568 croak($PACKAGE."::normalize: invalid form name: $form");
575 sub normalize_partial ($$) {
576 if (exists $formNorm{$_[0]}) {
577 my $n = normalize($_[0], $_[1]);
578 my($p, $u) = splitOnLastStarter($n);
582 croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
# Form-specific shortcuts for normalize_partial().  $_[0] is handed on
# directly rather than copied, because normalize_partial() is documented to
# modify its string argument as a side effect.

sub NFD_partial ($) {
    return normalize_partial('NFD', $_[0]);
}

sub NFC_partial ($) {
    return normalize_partial('NFC', $_[0]);
}

sub NFKD_partial ($) {
    return normalize_partial('NFKD', $_[0]);
}

sub NFKC_partial ($) {
    return normalize_partial('NFKC', $_[0]);
}
595 NFC => \&checkNFC, C => \&checkNFC,
596 NFD => \&checkNFD, D => \&checkNFD,
597 NFKC => \&checkNFKC, KC => \&checkNFKC,
598 NFKD => \&checkNFKD, KD => \&checkNFKD,
599 FCD => \&checkFCD, FCC => \&checkFCC,
606 if (exists $formCheck{$form}) {
607 return $formCheck{$form}->($str);
609 croak($PACKAGE."::check: invalid form name: $form");
617 Unicode::Normalize - Unicode Normalization Forms
621 (1) using function names exported by default:
623 use Unicode::Normalize;
625 $NFD_string = NFD($string); # Normalization Form D
626 $NFC_string = NFC($string); # Normalization Form C
627 $NFKD_string = NFKD($string); # Normalization Form KD
628 $NFKC_string = NFKC($string); # Normalization Form KC
630 (2) using function names exported on request:
632 use Unicode::Normalize 'normalize';
634 $NFD_string = normalize('D', $string); # Normalization Form D
635 $NFC_string = normalize('C', $string); # Normalization Form C
636 $NFKD_string = normalize('KD', $string); # Normalization Form KD
637 $NFKC_string = normalize('KC', $string); # Normalization Form KC
643 C<$string> is used as a string under character semantics (see F<perlunicode>).
645 C<$code_point> should be an unsigned integer representing a Unicode code point.
647 Note: Do not use a floating-point number or a negative value for C<$code_point>.
649 =head2 Normalization Forms
653 =item C<$NFD_string = NFD($string)>
655 It returns the Normalization Form D (formed by canonical decomposition).
657 =item C<$NFC_string = NFC($string)>
659 It returns the Normalization Form C (formed by canonical decomposition
660 followed by canonical composition).
662 =item C<$NFKD_string = NFKD($string)>
664 It returns the Normalization Form KD (formed by compatibility decomposition).
666 =item C<$NFKC_string = NFKC($string)>
668 It returns the Normalization Form KC (formed by compatibility decomposition
669 followed by B<canonical> composition).
671 =item C<$FCD_string = FCD($string)>
673 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
674 it returns the string without modification; otherwise it returns an FCD string.
676 Note: FCD is not always unique; several different forms may be equivalent
677 to each other. C<FCD()> will return one of these equivalent forms.
679 =item C<$FCC_string = FCC($string)>
681 It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
683 Note: FCC is unique, as are the four normalization forms (NF*).
685 =item C<$normalized_string = normalize($form_name, $string)>
687 It returns the normalization form of C<$form_name>.
689 As C<$form_name>, one of the following names must be given.
691 'C' or 'NFC' for Normalization Form C (UAX #15)
692 'D' or 'NFD' for Normalization Form D (UAX #15)
693 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
694 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
696 'FCD' for "Fast C or D" Form (UTN #5)
697 'FCC' for "Fast C Contiguous" (UTN #5)
701 =head2 Decomposition and Composition
705 =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
707 It returns the concatenation of the decomposition of each character
710 If the second parameter (a boolean) is omitted or false,
711 the decomposition is canonical decomposition;
712 if the second parameter (a boolean) is true,
713 the decomposition is compatibility decomposition.
715 The string returned is not always in NFD/NFKD. Reordering may be required.
717 $NFD_string = reorder(decompose($string)); # eq. to NFD()
718 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
720 =item C<$reordered_string = reorder($string)>
722 It returns the result of reordering the combining characters
723 according to Canonical Ordering Behavior.
725 For example, when you have a list of NFD/NFKD strings,
726 you can get the concatenated NFD/NFKD string from them, by saying
728 $concat_NFD = reorder(join '', @NFD_strings);
729 $concat_NFKD = reorder(join '', @NFKD_strings);
731 =item C<$composed_string = compose($string)>
733 It returns the result of canonical composition
734 without applying any decomposition.
736 For example, when you have a NFD/NFKD string,
737 you can get its NFC/NFKC string, by saying
739 $NFC_string = compose($NFD_string);
740 $NFKC_string = compose($NFKD_string);
742 =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
744 It returns two strings: the first one, C<$processed>, is the part
745 before the last starter, and the second one, C<$unprocessed>, is
746 the remaining part, beginning with the last starter. A starter is a
747 character having a combining class of zero (see UAX #15).
749 Note that C<$processed> may be empty (when C<$normalized> contains no
750 starter or starts with the last starter), and then C<$unprocessed>
751 should be equal to the entire C<$normalized>.
753 When you have a C<$normalized> string and an C<$unnormalized> string
754 following it, a simple concatenation is wrong:
756 $concat = $normalized . normalize($form, $unnormalized); # wrong!
758 Instead, do it like this:
760 ($processed, $unprocessed) = splitOnLastStarter($normalized);
761 $concat = $processed . normalize($form, $unprocessed.$unnormalized);
763 C<splitOnLastStarter()> should be called with a pre-normalized parameter
764 C<$normalized>, which is in the same form as the C<$form> you want.
766 If you have an array of C<@string> that should be concatenated and then
767 normalized, you can do it like this:
771 foreach my $str (@string) {
773 my $n = normalize($form, $unproc);
774 my($p, $u) = splitOnLastStarter($n);
779 # instead of normalize($form, join('', @string))
781 =item C<$processed = normalize_partial($form, $unprocessed)>
783 A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
784 Note that C<$unprocessed> will be modified as a side-effect.
786 If you have an array of C<@string> that should be concatenated and then
787 normalized, you can do it like this:
791 foreach my $str (@string) {
793 $result .= normalize_partial($form, $unproc);
796 # instead of normalize($form, join('', @string))
798 =item C<$processed = NFD_partial($unprocessed)>
800 It behaves like C<normalize_partial('NFD', $unprocessed)>.
801 Note that C<$unprocessed> will be modified as a side-effect.
803 =item C<$processed = NFC_partial($unprocessed)>
805 It behaves like C<normalize_partial('NFC', $unprocessed)>.
806 Note that C<$unprocessed> will be modified as a side-effect.
808 =item C<$processed = NFKD_partial($unprocessed)>
810 It behaves like C<normalize_partial('NFKD', $unprocessed)>.
811 Note that C<$unprocessed> will be modified as a side-effect.
813 =item C<$processed = NFKC_partial($unprocessed)>
815 It behaves like C<normalize_partial('NFKC', $unprocessed)>.
816 Note that C<$unprocessed> will be modified as a side-effect.
822 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
824 The following functions check whether the string is in that normalization form.
826 The result returned will be one of the following:
828 YES The string is in that normalization form.
829 NO The string is not in that normalization form.
830 MAYBE Dubious. Maybe yes, maybe no.
834 =item C<$result = checkNFD($string)>
836 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
838 =item C<$result = checkNFC($string)>
840 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
841 C<undef> if C<MAYBE>.
843 =item C<$result = checkNFKD($string)>
845 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
847 =item C<$result = checkNFKC($string)>
849 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
850 C<undef> if C<MAYBE>.
852 =item C<$result = checkFCD($string)>
854 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
856 =item C<$result = checkFCC($string)>
858 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
859 C<undef> if C<MAYBE>.
861 Note: If a string is not in FCD, it cannot be in FCC.
862 So C<checkFCC($not_FCD_string)> should return C<NO>.
864 =item C<$result = check($form_name, $string)>
866 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
867 C<undef> if C<MAYBE>.
869 As C<$form_name>, one of the following names must be given.
871 'C' or 'NFC' for Normalization Form C (UAX #15)
872 'D' or 'NFD' for Normalization Form D (UAX #15)
873 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
874 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
876 'FCD' for "Fast C or D" Form (UTN #5)
877 'FCC' for "Fast C Contiguous" (UTN #5)
883 In the cases of NFD, NFKD, and FCD, the answer must be
884 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
885 in the cases of NFC, NFKC, and FCC.
887 A C<MAYBE> string should contain at least one combining character
888 or the like. For example, C<COMBINING ACUTE ACCENT> has
889 the MAYBE_NFC/MAYBE_NFKC property.
891 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
892 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
893 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
894 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
895 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
897 If you want to check exactly, compare the string with its NFC/NFKC/FCC.
899 if ($string eq NFC($string)) {
900 # $string is exactly normalized in NFC;
902 # $string is not normalized in NFC;
905 if ($string eq NFKC($string)) {
906 # $string is exactly normalized in NFKC;
908 # $string is not normalized in NFKC;
911 =head2 Character Data
913 These functions are an interface to the character data used internally.
914 If you want only to get Unicode normalization forms, you don't need
919 =item C<$canonical_decomposition = getCanon($code_point)>
921 If the character is canonically decomposable (including Hangul Syllables),
922 it returns the (full) canonical decomposition as a string.
923 Otherwise it returns C<undef>.
925 B<Note:> According to the Unicode standard, the canonical decomposition
926 of a character that is not canonically decomposable is the same as
927 the character itself.
929 =item C<$compatibility_decomposition = getCompat($code_point)>
931 If the character is compatibility decomposable (including Hangul Syllables),
932 it returns the (full) compatibility decomposition as a string.
933 Otherwise it returns C<undef>.
935 B<Note:> According to the Unicode standard, the compatibility decomposition
936 of a character that is not compatibility decomposable is the same as
937 the character itself.
939 =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
941 If two characters here and next (as code points) are composable
942 (including Hangul Jamo/Syllables and Composition Exclusions),
943 it returns the code point of the composite.
945 If they are not composable, it returns C<undef>.
947 =item C<$combining_class = getCombinClass($code_point)>
949 It returns the combining class (as an integer) of the character.
951 =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
953 It returns a boolean indicating whether the character at the specified
954 code point may be composed with the previous one in a certain composition
955 (including Hangul Compositions, but excluding
956 Composition Exclusions and Non-Starter Decompositions).
958 =item C<$is_exclusion = isExclusion($code_point)>
960 It returns a boolean indicating whether the code point is a composition exclusion.
962 =item C<$is_singleton = isSingleton($code_point)>
964 It returns a boolean whether the code point is a singleton
966 =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
968 It returns a boolean indicating whether the code point has a Non-Starter Decomposition.
970 =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
972 It returns a boolean of the derived property Comp_Ex
973 (Full_Composition_Exclusion). This property is generated from
974 Composition Exclusions + Singletons + Non-Starter Decompositions.
976 =item C<$NFD_is_NO = isNFD_NO($code_point)>
978 It returns a boolean of the derived property NFD_NO
979 (NFD_Quick_Check=No).
981 =item C<$NFC_is_NO = isNFC_NO($code_point)>
983 It returns a boolean of the derived property NFC_NO
984 (NFC_Quick_Check=No).
986 =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
988 It returns a boolean of the derived property NFC_MAYBE
989 (NFC_Quick_Check=Maybe).
991 =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
993 It returns a boolean of the derived property NFKD_NO
994 (NFKD_Quick_Check=No).
996 =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
998 It returns a boolean of the derived property NFKC_NO
999 (NFKC_Quick_Check=No).
1001 =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
1003 It returns a boolean of the derived property NFKC_MAYBE
1004 (NFKC_Quick_Check=Maybe).
1010 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
1012 C<normalize> and some other functions: on request.
1018 =item Perl's version vs. Unicode version
1020 Since this module refers to perl core's Unicode database in the directory
1021 F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
1022 normalization implemented by this module depends on your perl's version.
1024 perl's version implemented Unicode version
1027 5.7.3 3.1.1 (normalization is same as 3.1.0)
1030 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
1039 =item Correction of decomposition mapping
1041 In older Unicode versions, a small number of characters (all of which are
1042 CJK compatibility ideographs as far as they have been found) may have
1043 an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
1044 Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
1045 nor provide any specific version of normalization. Therefore this module
1046 running on an older perl with an older Unicode database may use
1047 the erroneous decomposition mapping blindly conforming to the Unicode database.
1049 =item Revised definition of canonical composition
1051 In Unicode 4.1.0, the definition D2 of canonical composition (which
1052 affects NFC and NFKC) has been changed (see Public Review Issue #29
1053 and recent UAX #15). This module has used the newer definition
1054 since the version 0.07 (Oct 31, 2001).
1055 This module will not support the normalization according to the older
1056 definition, even if the Unicode version implemented by perl is
1063 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1065 Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
1067 This module is free software; you can redistribute it
1068 and/or modify it under the same terms as Perl itself.
1074 =item http://www.unicode.org/reports/tr15/
1076 Unicode Normalization Forms - UAX #15
1078 =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
1080 Composition Exclusion Table
1082 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
1084 Derived Normalization Properties
1086 =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
1088 Normalization Corrections
1090 =item http://www.unicode.org/review/pr-29.html
1092 Public Review Issue #29: Normalization Issue
1094 =item http://www.unicode.org/notes/tn5/
1096 Canonical Equivalence in Applications - UTN #5