This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
More leniency to the \p and \P: now can have whitespace
[perl5.git] / lib / Unicode / Normalize.pm
CommitLineData
45394607
JH
1package Unicode::Normalize;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7use Lingua::KO::Hangul::Util;
8
9our $VERSION = '0.04';
10our $PACKAGE = __PACKAGE__;
11
12require Exporter;
13our @ISA = qw(Exporter);
14our @EXPORT = qw( NFC NFD NFKC NFKD );
15our @EXPORT_OK = qw( normalize );
16our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
17
18our $Combin = do "unicore/CombiningClass.pl"
19 || do "unicode/CombiningClass.pl"
20 || croak "$PACKAGE: CombiningClass.pl not found";
21
22our $Decomp = do "unicore/Decomposition.pl"
23 || do "unicode/Decomposition.pl"
24 || croak "$PACKAGE: Decomposition.pl not found";
25
26our %Combin; # $codepoint => $number : combination class
27our %Canon; # $codepoint => \@codepoints : canonical decomp.
28our %Compat; # $codepoint => \@codepoints : compat. decomp.
29our %Compos; # $string => $codepoint : composite
30our %Exclus; # $codepoint => 1 : composition exclusions
31
32{
33 my($f, $fh);
34 foreach my $d (@INC) {
35 use File::Spec;
36 $f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
37 last if open($fh, $f);
38 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
39 last if open($fh, $f);
40 $f = undef;
41 }
42 croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
43 while(<$fh>){
44 next if /^#/ or /^$/;
45 s/#.*//;
46 $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
47 }
48 close $fh;
49}
50
51while($Combin =~ /(.+)/g)
52{
53 my @tab = split /\t/, $1;
54 my $ini = hex $tab[0];
55 if($tab[1] eq '')
56 {
57 $Combin{ $ini } = $tab[2];
58 }
59 else
60 {
61 $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
62 }
63}
64
65while($Decomp =~ /(.+)/g)
66{
67 my @tab = split /\t/, $1;
68 my $compat = $tab[2] =~ s/<[^>]+>//;
69 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
70 my $com = pack('U*', @$dec); # composable sequence
71 my $ini = hex($tab[0]);
72 if($tab[1] eq '')
73 {
74 $Compat{ $ini } = $dec;
75 if(! $compat){
76 $Canon{ $ini } = $dec;
77 $Compos{ $com } = $ini;
78 }
79 }
80 else
81 {
82 foreach my $u ($ini .. hex($tab[1])){
83 $Compat{ $u } = $dec;
84 if(! $compat){
85 $Canon{ $u } = $dec;
86 $Compos{ $com } = $ini;
87 }
88 }
89 }
90}
91
92foreach my $key (keys %Canon) # exhaustive decomposition
93{
94 $Canon{$key} = [ getCanonList($key) ];
95}
96
97foreach my $key (keys %Compat) # exhaustive decomposition
98{
99 $Compat{$key} = [ getCompatList($key) ];
100}
101
102sub getCanonList
103{
104 my @src = @_;
105 my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
106 join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
107 # condition @src == @dec is not ok.
108}
109
110sub getCompatList
111{
112 my @src = @_;
113 my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
114 join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
115 # condition @src == @dec is not ok.
116}
117
118sub NFD($){ _decompose(shift, 0) }
119
120sub NFKD($){ _decompose(shift, 1) }
121
122sub NFC($){ _compose(NFD(shift)) }
123
124sub NFKC($){ _compose(NFKD(shift)) }
125
126sub normalize($$)
127{
128 my($form,$str) = @_;
129 $form eq 'D' || $form eq 'NFD' ? NFD($str) :
130 $form eq 'C' || $form eq 'NFC' ? NFC($str) :
131 $form eq 'KD' || $form eq 'NFKD' ? NFKD($str) :
132 $form eq 'KC' || $form eq 'NFKC' ? NFKC($str) :
133 croak $PACKAGE."::normalize: invalid form name: $form";
134}
135
136
137##
138## string _decompose(string, compat?)
139##
140sub _decompose
141{
142 my $str = $_[0];
143 my $hash = $_[1] ? \%Compat : \%Canon;
144 my @ret;
145 my $retstr="";
146 foreach my $u (unpack 'U*', $str){
147 push @ret,
148 $hash->{ $u } ? @{ $hash->{ $u } } :
149 _isHangul($u) ? decomposeHangul($u) : $u;
150 }
151 for(my $i=0; $i<@ret;){
152 $retstr .= pack('U', $ret[$i++]), next
153 unless $Combin{ $ret[$i] } && $i+1 < @ret && $Combin{ $ret[$i+1] };
154 my @tmp;
155 push(@tmp, $ret[$i++]) while $i < @ret && $Combin{ $ret[$i] };
156 $retstr .= pack 'U*', @tmp[
157 sort {
158 $Combin{ $tmp[$a] } <=> $Combin{ $tmp[$b] } || $a <=> $b
159 } 0 .. @tmp - 1,
160 ];
161 }
162 $retstr;
163}
164
165##
166## string _compose(string)
167##
168## S : starter; NS : not starter;
169##
170## composable sequence begins at S.
171## S + S or (S + S) + S may be composed.
172## NS + NS must not be composed.
173##
174sub _compose
175{
176 my @src = unpack('U*', composeHangul shift); # get codepoints
177 for(my $s = 0; $s+1 < @src; $s++){
178 next unless defined $src[$s] && ! $Combin{ $src[$s] }; # S only
179 my($c, $blocked);
180 for(my $j = $s+1; $j < @src && !$blocked; $j++){
181 $blocked = 1 if ! $Combin{ $src[$j] };
182
183 next if $j != $s + 1 && defined $src[$j-1]
184 && $Combin{ $src[$j-1] } && $Combin{ $src[$j] }
185 && $Combin{ $src[$j-1] } == $Combin{ $src[$j] };
186
187 if( # $c != 0, maybe.
188 $c = $Compos{pack('U*', @src[$s,$j])} and ! $Exclus{$c}
189 )
190 {
191 $src[$s] = $c; $src[$j] = undef; $blocked = 0;
192 }
193 }
194 }
195 pack 'U*', grep defined(), @src;
196}
197
198##
199## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
200##
201sub _getHexArray
202{
203 my $str = shift;
204 map hex(), $str =~ /([0-9A-Fa-f]+)/g;
205}
206
207##
208## Hangul Syllables
209##
210sub _isHangul
211{
212 my $code = shift;
213 return 0xAC00 <= $code && $code <= 0xD7A3;
214}
215
216##
217## for Debug
218##
219sub _getCombin { wantarray ? %Combin : \%Combin }
220sub _getCanon { wantarray ? %Canon : \%Canon }
221sub _getCompat { wantarray ? %Compat : \%Compat }
222sub _getCompos { wantarray ? %Compos : \%Compos }
223sub _getExclus { wantarray ? %Exclus : \%Exclus }
2241;
225__END__
226
227=head1 NAME
228
229Unicode::Normalize - normalized forms of Unicode text
230
231=head1 SYNOPSIS
232
233 use Unicode::Normalize;
234
235 $string_NFD = NFD($raw_string); # Normalization Form D
236 $string_NFC = NFC($raw_string); # Normalization Form C
237 $string_NFKD = NFKD($raw_string); # Normalization Form KD
238 $string_NFKC = NFKC($raw_string); # Normalization Form KC
239
240 or
241
242 use Unicode::Normalize 'normalize';
243
244 $string_NFD = normalize('D', $raw_string); # Normalization Form D
245 $string_NFC = normalize('C', $raw_string); # Normalization Form C
246 $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
247 $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
248
249=head1 DESCRIPTION
250
251=over 4
252
253=item C<$string_NFD = NFD($raw_string)>
254
255returns the Normalization Form D (formed by canonical decomposition).
256
257
258=item C<$string_NFC = NFC($raw_string)>
259
260returns the Normalization Form C (formed by canonical decomposition
261followed by canonical composition).
262
263=item C<$string_NFKD = NFKD($raw_string)>
264
265returns the Normalization Form KD (formed by compatibility decomposition).
266
267=item C<$string_NFKC = NFKC($raw_string)>
268
269returns the Normalization Form KC (formed by compatibility decomposition
270followed by B<canonical> composition).
271
272=item C<$normalized_string = normalize($form_name, $raw_string)>
273
274As C<$form_name>, one of the following names must be given.
275
276 'C' or 'NFC' for Normalization Form C
277 'D' or 'NFD' for Normalization Form D
278 'KC' or 'NFKC' for Normalization Form KC
279 'KD' or 'NFKD' for Normalization Form KD
280
281=back
282
283=head2 EXPORT
284
285C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
286
287C<normalize>: on request.
288
289=head1 AUTHOR
290
291SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
292
293 http://homepage1.nifty.com/nomenclator/perl/
294
295 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
296
297 This program is free software; you can redistribute it and/or
298 modify it under the same terms as Perl itself.
299
300=head1 SEE ALSO
301
302=over 4
303
304=item L<Lingua::KO::Hangul::Util>
305
306utility functions for Hangul Syllables
307
308=item http://www.unicode.org/unicode/reports/tr15/
309
310Unicode Normalization Forms - UAX #15
311
312=back
313
314=cut