Commit | Line | Data |
---|---|---|
45394607 JH |
1 | package Unicode::Normalize; |
2 | ||
3 | use 5.006; | |
4 | use strict; | |
5 | use warnings; | |
6 | use Carp; | |
7 | use Lingua::KO::Hangul::Util; | |
8 | ||
9 | our $VERSION = '0.04'; | |
10 | our $PACKAGE = __PACKAGE__; | |
11 | ||
12 | require Exporter; | |
13 | our @ISA = qw(Exporter); | |
14 | our @EXPORT = qw( NFC NFD NFKC NFKD ); | |
15 | our @EXPORT_OK = qw( normalize ); | |
16 | our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] ); | |
17 | ||
18 | our $Combin = do "unicore/CombiningClass.pl" | |
19 | || do "unicode/CombiningClass.pl" | |
20 | || croak "$PACKAGE: CombiningClass.pl not found"; | |
21 | ||
22 | our $Decomp = do "unicore/Decomposition.pl" | |
23 | || do "unicode/Decomposition.pl" | |
24 | || croak "$PACKAGE: Decomposition.pl not found"; | |
25 | ||
26 | our %Combin; # $codepoint => $number : combination class | |
27 | our %Canon; # $codepoint => \@codepoints : canonical decomp. | |
28 | our %Compat; # $codepoint => \@codepoints : compat. decomp. | |
29 | our %Compos; # $string => $codepoint : composite | |
30 | our %Exclus; # $codepoint => 1 : composition exclusions | |
31 | ||
32 | { | |
33 | my($f, $fh); | |
34 | foreach my $d (@INC) { | |
35 | use File::Spec; | |
36 | $f = File::Spec->catfile($d, "unicore", "CompExcl.txt"); | |
37 | last if open($fh, $f); | |
38 | $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); | |
39 | last if open($fh, $f); | |
40 | $f = undef; | |
41 | } | |
42 | croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f; | |
43 | while(<$fh>){ | |
44 | next if /^#/ or /^$/; | |
45 | s/#.*//; | |
46 | $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/; | |
47 | } | |
48 | close $fh; | |
49 | } | |
50 | ||
51 | while($Combin =~ /(.+)/g) | |
52 | { | |
53 | my @tab = split /\t/, $1; | |
54 | my $ini = hex $tab[0]; | |
55 | if($tab[1] eq '') | |
56 | { | |
57 | $Combin{ $ini } = $tab[2]; | |
58 | } | |
59 | else | |
60 | { | |
61 | $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); | |
62 | } | |
63 | } | |
64 | ||
65 | while($Decomp =~ /(.+)/g) | |
66 | { | |
67 | my @tab = split /\t/, $1; | |
68 | my $compat = $tab[2] =~ s/<[^>]+>//; | |
69 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition | |
70 | my $com = pack('U*', @$dec); # composable sequence | |
71 | my $ini = hex($tab[0]); | |
72 | if($tab[1] eq '') | |
73 | { | |
74 | $Compat{ $ini } = $dec; | |
75 | if(! $compat){ | |
76 | $Canon{ $ini } = $dec; | |
77 | $Compos{ $com } = $ini; | |
78 | } | |
79 | } | |
80 | else | |
81 | { | |
82 | foreach my $u ($ini .. hex($tab[1])){ | |
83 | $Compat{ $u } = $dec; | |
84 | if(! $compat){ | |
85 | $Canon{ $u } = $dec; | |
86 | $Compos{ $com } = $ini; | |
87 | } | |
88 | } | |
89 | } | |
90 | } | |
91 | ||
92 | foreach my $key (keys %Canon) # exhaustive decomposition | |
93 | { | |
94 | $Canon{$key} = [ getCanonList($key) ]; | |
95 | } | |
96 | ||
97 | foreach my $key (keys %Compat) # exhaustive decomposition | |
98 | { | |
99 | $Compat{$key} = [ getCompatList($key) ]; | |
100 | } | |
101 | ||
102 | sub getCanonList | |
103 | { | |
104 | my @src = @_; | |
105 | my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src; | |
106 | join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); | |
107 | # condition @src == @dec is not ok. | |
108 | } | |
109 | ||
110 | sub getCompatList | |
111 | { | |
112 | my @src = @_; | |
113 | my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src; | |
114 | join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); | |
115 | # condition @src == @dec is not ok. | |
116 | } | |
117 | ||
118 | sub NFD($){ _decompose(shift, 0) } | |
119 | ||
120 | sub NFKD($){ _decompose(shift, 1) } | |
121 | ||
122 | sub NFC($){ _compose(NFD(shift)) } | |
123 | ||
124 | sub NFKC($){ _compose(NFKD(shift)) } | |
125 | ||
126 | sub normalize($$) | |
127 | { | |
128 | my($form,$str) = @_; | |
129 | $form eq 'D' || $form eq 'NFD' ? NFD($str) : | |
130 | $form eq 'C' || $form eq 'NFC' ? NFC($str) : | |
131 | $form eq 'KD' || $form eq 'NFKD' ? NFKD($str) : | |
132 | $form eq 'KC' || $form eq 'NFKC' ? NFKC($str) : | |
133 | croak $PACKAGE."::normalize: invalid form name: $form"; | |
134 | } | |
135 | ||
136 | ||
137 | ## | |
138 | ## string _decompose(string, compat?) | |
139 | ## | |
140 | sub _decompose | |
141 | { | |
142 | my $str = $_[0]; | |
143 | my $hash = $_[1] ? \%Compat : \%Canon; | |
144 | my @ret; | |
145 | my $retstr=""; | |
146 | foreach my $u (unpack 'U*', $str){ | |
147 | push @ret, | |
148 | $hash->{ $u } ? @{ $hash->{ $u } } : | |
149 | _isHangul($u) ? decomposeHangul($u) : $u; | |
150 | } | |
151 | for(my $i=0; $i<@ret;){ | |
152 | $retstr .= pack('U', $ret[$i++]), next | |
153 | unless $Combin{ $ret[$i] } && $i+1 < @ret && $Combin{ $ret[$i+1] }; | |
154 | my @tmp; | |
155 | push(@tmp, $ret[$i++]) while $i < @ret && $Combin{ $ret[$i] }; | |
156 | $retstr .= pack 'U*', @tmp[ | |
157 | sort { | |
158 | $Combin{ $tmp[$a] } <=> $Combin{ $tmp[$b] } || $a <=> $b | |
159 | } 0 .. @tmp - 1, | |
160 | ]; | |
161 | } | |
162 | $retstr; | |
163 | } | |
164 | ||
165 | ## | |
166 | ## string _compose(string) | |
167 | ## | |
168 | ## S : starter; NS : not starter; | |
169 | ## | |
170 | ## composable sequence begins at S. | |
171 | ## S + S or (S + S) + S may be composed. | |
172 | ## NS + NS must not be composed. | |
173 | ## | |
174 | sub _compose | |
175 | { | |
176 | my @src = unpack('U*', composeHangul shift); # get codepoints | |
177 | for(my $s = 0; $s+1 < @src; $s++){ | |
178 | next unless defined $src[$s] && ! $Combin{ $src[$s] }; # S only | |
179 | my($c, $blocked); | |
180 | for(my $j = $s+1; $j < @src && !$blocked; $j++){ | |
181 | $blocked = 1 if ! $Combin{ $src[$j] }; | |
182 | ||
183 | next if $j != $s + 1 && defined $src[$j-1] | |
184 | && $Combin{ $src[$j-1] } && $Combin{ $src[$j] } | |
185 | && $Combin{ $src[$j-1] } == $Combin{ $src[$j] }; | |
186 | ||
187 | if( # $c != 0, maybe. | |
188 | $c = $Compos{pack('U*', @src[$s,$j])} and ! $Exclus{$c} | |
189 | ) | |
190 | { | |
191 | $src[$s] = $c; $src[$j] = undef; $blocked = 0; | |
192 | } | |
193 | } | |
194 | } | |
195 | pack 'U*', grep defined(), @src; | |
196 | } | |
197 | ||
198 | ## | |
199 | ## "hhhh hhhh hhhh" to (dddd, dddd, dddd) | |
200 | ## | |
201 | sub _getHexArray | |
202 | { | |
203 | my $str = shift; | |
204 | map hex(), $str =~ /([0-9A-Fa-f]+)/g; | |
205 | } | |
206 | ||
207 | ## | |
208 | ## Hangul Syllables | |
209 | ## | |
210 | sub _isHangul | |
211 | { | |
212 | my $code = shift; | |
213 | return 0xAC00 <= $code && $code <= 0xD7A3; | |
214 | } | |
215 | ||
216 | ## | |
217 | ## for Debug | |
218 | ## | |
219 | sub _getCombin { wantarray ? %Combin : \%Combin } | |
220 | sub _getCanon { wantarray ? %Canon : \%Canon } | |
221 | sub _getCompat { wantarray ? %Compat : \%Compat } | |
222 | sub _getCompos { wantarray ? %Compos : \%Compos } | |
223 | sub _getExclus { wantarray ? %Exclus : \%Exclus } | |
224 | 1; | |
225 | __END__ | |
226 | ||
227 | =head1 NAME | |
228 | ||
229 | Unicode::Normalize - normalized forms of Unicode text | |
230 | ||
231 | =head1 SYNOPSIS | |
232 | ||
233 | use Unicode::Normalize; | |
234 | ||
235 | $string_NFD = NFD($raw_string); # Normalization Form D | |
236 | $string_NFC = NFC($raw_string); # Normalization Form C | |
237 | $string_NFKD = NFKD($raw_string); # Normalization Form KD | |
238 | $string_NFKC = NFKC($raw_string); # Normalization Form KC | |
239 | ||
240 | or | |
241 | ||
242 | use Unicode::Normalize 'normalize'; | |
243 | ||
244 | $string_NFD = normalize('D', $raw_string); # Normalization Form D | |
245 | $string_NFC = normalize('C', $raw_string); # Normalization Form C | |
246 | $string_NFKD = normalize('KD', $raw_string); # Normalization Form KD | |
247 | $string_NFKC = normalize('KC', $raw_string); # Normalization Form KC | |
248 | ||
249 | =head1 DESCRIPTION | |
250 | ||
251 | =over 4 | |
252 | ||
253 | =item C<$string_NFD = NFD($raw_string)> | |
254 | ||
255 | returns the Normalization Form D (formed by canonical decomposition). | |
256 | ||
257 | ||
258 | =item C<$string_NFC = NFC($raw_string)> | |
259 | ||
260 | returns the Normalization Form C (formed by canonical decomposition | |
261 | followed by canonical composition). | |
262 | ||
263 | =item C<$string_NFKD = NFKD($raw_string)> | |
264 | ||
265 | returns the Normalization Form KD (formed by compatibility decomposition). | |
266 | ||
267 | =item C<$string_NFKC = NFKC($raw_string)> | |
268 | ||
269 | returns the Normalization Form KC (formed by compatibility decomposition | |
270 | followed by B<canonical> composition). | |
271 | ||
272 | =item C<$normalized_string = normalize($form_name, $raw_string)> | |
273 | ||
274 | As C<$form_name>, one of the following names must be given. | |
275 | ||
276 | 'C' or 'NFC' for Normalization Form C | |
277 | 'D' or 'NFD' for Normalization Form D | |
278 | 'KC' or 'NFKC' for Normalization Form KC | |
279 | 'KD' or 'NFKD' for Normalization Form KD | |
280 | ||
281 | =back | |
282 | ||
283 | =head2 EXPORT | |
284 | ||
285 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. | |
286 | ||
287 | C<normalize>: on request. | |
288 | ||
289 | =head1 AUTHOR | |
290 | ||
291 | SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt> | |
292 | ||
293 | http://homepage1.nifty.com/nomenclator/perl/ | |
294 | ||
295 | Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. | |
296 | ||
297 | This program is free software; you can redistribute it and/or | |
298 | modify it under the same terms as Perl itself. | |
299 | ||
300 | =head1 SEE ALSO | |
301 | ||
302 | =over 4 | |
303 | ||
304 | =item L<Lingua::KO::Hangul::Util> | |
305 | ||
306 | utility functions for Hangul Syllables | |
307 | ||
308 | =item http://www.unicode.org/unicode/reports/tr15/ | |
309 | ||
310 | Unicode Normalization Forms - UAX #15 | |
311 | ||
312 | =back | |
313 | ||
314 | =cut |