[perl5.git] / lib / Unicode / Normalize.pm

package Unicode::Normalize;

use 5.006;
use strict;
use warnings;
use Carp;
use Lingua::KO::Hangul::Util;

our $VERSION = '0.04';
our $PACKAGE = __PACKAGE__;

require Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw( NFC NFD NFKC NFKD );
our @EXPORT_OK = qw( normalize );
our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );

our $Combin = do "unicore/CombiningClass.pl"
           || do "unicode/CombiningClass.pl"
           || croak "$PACKAGE: CombiningClass.pl not found";

our $Decomp = do "unicore/Decomposition.pl"
           || do "unicode/Decomposition.pl"
           || croak "$PACKAGE: Decomposition.pl not found";

our %Combin; # $codepoint => $number      : combination class
our %Canon;  # $codepoint => \@codepoints : canonical decomp.
our %Compat; # $codepoint => \@codepoints : compat. decomp.
our %Compos; # $string    => $codepoint   : composite
our %Exclus; # $codepoint => 1            : composition exclusions

{
  my($f, $fh);
  foreach my $d (@INC) {
    use File::Spec;
    $f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
    last if open($fh, $f);
    $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
    last if open($fh, $f);
    $f = undef;
  }
  croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
  while(<$fh>){
    next if /^#/ or /^$/;
    s/#.*//;
    $Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
  }
  close $fh;
}

while($Combin =~ /(.+)/g)
{
  my @tab = split /\t/, $1;
  my $ini = hex $tab[0];
  if($tab[1] eq '')
  {
    $Combin{ $ini } = $tab[2];
  }
  else
  {
    $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
  }
}

while($Decomp =~ /(.+)/g)
{
  my @tab = split /\t/, $1;
  my $compat = $tab[2] =~ s/<[^>]+>//;
  my $dec = [ _getHexArray($tab[2]) ]; # decomposition
  my $com = pack('U*', @$dec); # composable sequence
  my $ini = hex($tab[0]);
  if($tab[1] eq '')
  {
    $Compat{ $ini } = $dec;
    if(! $compat){
      $Canon{  $ini } = $dec;
      $Compos{ $com } = $ini;
    }
  }
  else
  {
    foreach my $u ($ini .. hex($tab[1])){
      $Compat{ $u } = $dec;
      if(! $compat){
        $Canon{  $u }   = $dec;
        $Compos{ $com } = $ini;
      }
    }
  }
}

foreach my $key (keys %Canon)  # exhaustive decomposition
{
   $Canon{$key}  = [ getCanonList($key) ];
}

foreach my $key (keys %Compat) # exhaustive decomposition
{
   $Compat{$key} = [ getCompatList($key) ];
}

sub getCanonList
{
  my @src = @_;
  my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
  join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
  # condition @src == @dec is not ok.
}

sub getCompatList
{
  my @src = @_;
  my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
  join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
  # condition @src == @dec is not ok.
}

sub NFD($){ _decompose(shift, 0) }

sub NFKD($){ _decompose(shift, 1) }

sub NFC($){ _compose(NFD(shift)) }

sub NFKC($){ _compose(NFKD(shift)) }

sub normalize($$)
{
  my($form,$str) = @_;
  $form eq 'D'  || $form eq 'NFD'  ? NFD($str) :
  $form eq 'C'  || $form eq 'NFC'  ? NFC($str) :
  $form eq 'KD' || $form eq 'NFKD' ? NFKD($str) :
  $form eq 'KC' || $form eq 'NFKC' ? NFKC($str) :
    croak $PACKAGE."::normalize: invalid form name: $form";
}


##
## string _decompose(string, compat?)
##
sub _decompose
{
  my $str  = $_[0];
  my $hash = $_[1] ? \%Compat : \%Canon;
  my @ret;
  my $retstr="";
  foreach my $u (unpack 'U*', $str){
    push @ret,
      $hash->{ $u }  ? @{ $hash->{ $u } } :
      _isHangul($u) ? decomposeHangul($u) : $u;
  }
  for(my $i=0; $i<@ret;){
    $retstr .= pack('U', $ret[$i++]), next
       unless $Combin{ $ret[$i] } && $i+1 < @ret && $Combin{ $ret[$i+1] };
    my @tmp;
    push(@tmp, $ret[$i++]) while $i < @ret && $Combin{ $ret[$i] };
    $retstr .= pack 'U*', @tmp[
      sort {
        $Combin{ $tmp[$a] } <=> $Combin{ $tmp[$b] } || $a <=> $b
      } 0 .. @tmp - 1,
    ];
  }
  $retstr;
}

##
## string _compose(string)
##
## S : starter; NS : not starter;
##
## composable sequence begins at S.
## S + S or (S + S) + S may be composed.
## NS + NS must not be composed.
##
sub _compose
{
  my @src = unpack('U*', composeHangul shift); # get codepoints
  for(my $s = 0; $s+1 < @src; $s++){
    next unless defined $src[$s] && ! $Combin{ $src[$s] }; # S only
    my($c, $blocked);
    for(my $j = $s+1; $j < @src && !$blocked; $j++){
      $blocked = 1 if ! $Combin{ $src[$j] };

      next if $j != $s + 1 && defined $src[$j-1]
        && $Combin{ $src[$j-1] } && $Combin{ $src[$j] } 
        && $Combin{ $src[$j-1] } == $Combin{ $src[$j] };

      if(  # $c != 0, maybe.
        $c = $Compos{pack('U*', @src[$s,$j])} and ! $Exclus{$c}
      )
      {
        $src[$s] = $c; $src[$j] = undef; $blocked = 0;
      }
    }
  }
  pack 'U*', grep defined(), @src;
}

##
## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
##
sub _getHexArray
{
  my $str = shift;
  map hex(), $str =~ /([0-9A-Fa-f]+)/g;
}

##
## Hangul Syllables
##
sub _isHangul
{
  my $code = shift;
  return 0xAC00 <= $code && $code <= 0xD7A3;
}

##
## for Debug
##
sub _getCombin { wantarray ? %Combin : \%Combin }
sub _getCanon  { wantarray ? %Canon  : \%Canon  }
sub _getCompat { wantarray ? %Compat : \%Compat }
sub _getCompos { wantarray ? %Compos : \%Compos }
sub _getExclus { wantarray ? %Exclus : \%Exclus }
1;
__END__

=head1 NAME

Unicode::Normalize - normalized forms of Unicode text

=head1 SYNOPSIS

  use Unicode::Normalize;

  $string_NFD  = NFD($raw_string);  # Normalization Form D
  $string_NFC  = NFC($raw_string);  # Normalization Form C
  $string_NFKD = NFKD($raw_string); # Normalization Form KD
  $string_NFKC = NFKC($raw_string); # Normalization Form KC

   or

  use Unicode::Normalize 'normalize';

  $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
  $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
  $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
  $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC

=head1 DESCRIPTION

=over 4

=item C<$string_NFD = NFD($raw_string)>

returns the Normalization Form D (formed by canonical decomposition).


=item C<$string_NFC = NFC($raw_string)>

returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$string_NFKD = NFKD($raw_string)>

returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$string_NFKC = NFKC($raw_string)>

returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$normalized_string = normalize($form_name, $raw_string)>

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C
  'D'  or 'NFD'  for Normalization Form D
  'KC' or 'NFKC' for Normalization Form KC
  'KD' or 'NFKD' for Normalization Form KD

=back

=head2 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize>: on request.

=head1 AUTHOR

SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>

  http://homepage1.nifty.com/nomenclator/perl/

  Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.

  This program is free software; you can redistribute it and/or 
  modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item L<Lingua::KO::Hangul::Util>

utility functions for Hangul Syllables

=item http://www.unicode.org/unicode/reports/tr15/

Unicode Normalization Forms - UAX #15

=back

=cut
Commit	Line	Data
45394607 JH	1	package Unicode::Normalize;
	2
	3	use 5.006;
	4	use strict;
	5	use warnings;
	6	use Carp;
	7	use Lingua::KO::Hangul::Util;
	8
	9	our $VERSION = '0.04';
	10	our $PACKAGE = __PACKAGE__;
	11
	12	require Exporter;
	13	our @ISA = qw(Exporter);
	14	our @EXPORT = qw( NFC NFD NFKC NFKD );
	15	our @EXPORT_OK = qw( normalize );
	16	our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
	17
	18	our $Combin = do "unicore/CombiningClass.pl"
	19	\|\| do "unicode/CombiningClass.pl"
	20	\|\| croak "$PACKAGE: CombiningClass.pl not found";
	21
	22	our $Decomp = do "unicore/Decomposition.pl"
	23	\|\| do "unicode/Decomposition.pl"
	24	\|\| croak "$PACKAGE: Decomposition.pl not found";
	25
	26	our %Combin; # $codepoint => $number : combination class
	27	our %Canon; # $codepoint => \@codepoints : canonical decomp.
	28	our %Compat; # $codepoint => \@codepoints : compat. decomp.
	29	our %Compos; # $string => $codepoint : composite
	30	our %Exclus; # $codepoint => 1 : composition exclusions
	31
	32	{
	33	my($f, $fh);
	34	foreach my $d (@INC) {
	35	use File::Spec;
	36	$f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
	37	last if open($fh, $f);
	38	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
	39	last if open($fh, $f);
	40	$f = undef;
	41	}
	42	croak "$PACKAGE: CompExcl.txt not found in @INC" unless defined $f;
	43	while(<$fh>){
	44	next if /^#/ or /^$/;
	45	s/#.*//;
	46	$Exclus{ hex($1) } =1 if /([0-9A-Fa-f]+)/;
	47	}
	48	close $fh;
	49	}
	50
	51	while($Combin =~ /(.+)/g)
	52	{
	53	my @tab = split /\t/, $1;
	54	my $ini = hex $tab[0];
	55	if($tab[1] eq '')
	56	{
	57	$Combin{ $ini } = $tab[2];
	58	}
	59	else
	60	{
	61	$Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
	62	}
	63	}
	64
65	while($Decomp =~ /(.+)/g)
66	{
67	my @tab = split /\t/, $1;
68	my $compat = $tab[2] =~ s/<[^>]+>//;
69	my $dec = [ _getHexArray($tab[2]) ]; # decomposition
70	my $com = pack('U*', @$dec); # composable sequence
71	my $ini = hex($tab[0]);
72	if($tab[1] eq '')
73	{
74	$Compat{ $ini } = $dec;
75	if(! $compat){
76	$Canon{ $ini } = $dec;
77	$Compos{ $com } = $ini;
78	}
79	}
80	else
81	{
82	foreach my $u ($ini .. hex($tab[1])){
83	$Compat{ $u } = $dec;
84	if(! $compat){
85	$Canon{ $u } = $dec;
86	$Compos{ $com } = $ini;
87	}
88	}
89	}
90	}
91
92	foreach my $key (keys %Canon) # exhaustive decomposition
93	{
94	$Canon{$key} = [ getCanonList($key) ];
95	}
96
97	foreach my $key (keys %Compat) # exhaustive decomposition
98	{
99	$Compat{$key} = [ getCompatList($key) ];
100	}
101
102	sub getCanonList
103	{
104	my @src = @_;
105	my @dec = map $Canon{$_} ? @{ $Canon{$_} } : $_, @src;
106	join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
107	# condition @src == @dec is not ok.
108	}
109
110	sub getCompatList
111	{
112	my @src = @_;
113	my @dec = map $Compat{$_} ? @{ $Compat{$_} } : $_, @src;
114	join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
115	# condition @src == @dec is not ok.
116	}
117
118	sub NFD($){ _decompose(shift, 0) }
119
120	sub NFKD($){ _decompose(shift, 1) }
121
122	sub NFC($){ _compose(NFD(shift)) }
123
124	sub NFKC($){ _compose(NFKD(shift)) }
125
126	sub normalize($$)
127	{
128	my($form,$str) = @_;
129	$form eq 'D' \|\| $form eq 'NFD' ? NFD($str) :
130	$form eq 'C' \|\| $form eq 'NFC' ? NFC($str) :
131	$form eq 'KD' \|\| $form eq 'NFKD' ? NFKD($str) :
132	$form eq 'KC' \|\| $form eq 'NFKC' ? NFKC($str) :
133	croak $PACKAGE."::normalize: invalid form name: $form";
134	}
135
136
137	##
138	## string _decompose(string, compat?)
139	##
140	sub _decompose
141	{
142	my $str = $_[0];
143	my $hash = $_[1] ? \%Compat : \%Canon;
144	my @ret;
145	my $retstr="";
146	foreach my $u (unpack 'U*', $str){
147	push @ret,
148	$hash->{ $u } ? @{ $hash->{ $u } } :
149	_isHangul($u) ? decomposeHangul($u) : $u;
150	}
151	for(my $i=0; $i<@ret;){
152	$retstr .= pack('U', $ret[$i++]), next
153	unless $Combin{ $ret[$i] } && $i+1 < @ret && $Combin{ $ret[$i+1] };
154	my @tmp;
155	push(@tmp, $ret[$i++]) while $i < @ret && $Combin{ $ret[$i] };
156	$retstr .= pack 'U*', @tmp[
157	sort {
158	$Combin{ $tmp[$a] } <=> $Combin{ $tmp[$b] } \|\| $a <=> $b
159	} 0 .. @tmp - 1,
160	];
161	}
162	$retstr;
163	}
164
165	##
166	## string _compose(string)
167	##
168	## S : starter; NS : not starter;
169	##
170	## composable sequence begins at S.
171	## S + S or (S + S) + S may be composed.
172	## NS + NS must not be composed.
173	##
174	sub _compose
175	{
176	my @src = unpack('U*', composeHangul shift); # get codepoints
177	for(my $s = 0; $s+1 < @src; $s++){
178	next unless defined $src[$s] && ! $Combin{ $src[$s] }; # S only
179	my($c, $blocked);
180	for(my $j = $s+1; $j < @src && !$blocked; $j++){
181	$blocked = 1 if ! $Combin{ $src[$j] };
182
183	next if $j != $s + 1 && defined $src[$j-1]
184	&& $Combin{ $src[$j-1] } && $Combin{ $src[$j] }
185	&& $Combin{ $src[$j-1] } == $Combin{ $src[$j] };
186
187	if( # $c != 0, maybe.
188	$c = $Compos{pack('U*', @src[$s,$j])} and ! $Exclus{$c}
189	)
190	{
191	$src[$s] = $c; $src[$j] = undef; $blocked = 0;
192	}
193	}
194	}
195	pack 'U*', grep defined(), @src;
196	}
197
198	##
199	## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
200	##
201	sub _getHexArray
202	{
203	my $str = shift;
204	map hex(), $str =~ /([0-9A-Fa-f]+)/g;
205	}
206
207	##
208	## Hangul Syllables
209	##
210	sub _isHangul
211	{
212	my $code = shift;
213	return 0xAC00 <= $code && $code <= 0xD7A3;
214	}
215
216	##
217	## for Debug
218	##
219	sub _getCombin { wantarray ? %Combin : \%Combin }
220	sub _getCanon { wantarray ? %Canon : \%Canon }
221	sub _getCompat { wantarray ? %Compat : \%Compat }
222	sub _getCompos { wantarray ? %Compos : \%Compos }
223	sub _getExclus { wantarray ? %Exclus : \%Exclus }
224	1;
225	__END__
226
227	=head1 NAME
228
229	Unicode::Normalize - normalized forms of Unicode text
230
231	=head1 SYNOPSIS
232
233	use Unicode::Normalize;
234
235	$string_NFD = NFD($raw_string); # Normalization Form D
236	$string_NFC = NFC($raw_string); # Normalization Form C
237	$string_NFKD = NFKD($raw_string); # Normalization Form KD
238	$string_NFKC = NFKC($raw_string); # Normalization Form KC
239
240	or
241
242	use Unicode::Normalize 'normalize';
243
244	$string_NFD = normalize('D', $raw_string); # Normalization Form D
245	$string_NFC = normalize('C', $raw_string); # Normalization Form C
246	$string_NFKD = normalize('KD', $raw_string); # Normalization Form KD
247	$string_NFKC = normalize('KC', $raw_string); # Normalization Form KC
248
249	=head1 DESCRIPTION
250
251	=over 4
252
253	=item C<$string_NFD = NFD($raw_string)>
254
255	returns the Normalization Form D (formed by canonical decomposition).
256
257
258	=item C<$string_NFC = NFC($raw_string)>
259
260	returns the Normalization Form C (formed by canonical decomposition
261	followed by canonical composition).
262
263	=item C<$string_NFKD = NFKD($raw_string)>
264
265	returns the Normalization Form KD (formed by compatibility decomposition).
266
267	=item C<$string_NFKC = NFKC($raw_string)>
268
269	returns the Normalization Form KC (formed by compatibility decomposition
270	followed by B<canonical> composition).
271
272	=item C<$normalized_string = normalize($form_name, $raw_string)>
273
274	As C<$form_name>, one of the following names must be given.
275
276	'C' or 'NFC' for Normalization Form C
277	'D' or 'NFD' for Normalization Form D
278	'KC' or 'NFKC' for Normalization Form KC
279	'KD' or 'NFKD' for Normalization Form KD
280
281	=back
282
283	=head2 EXPORT
284
285	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
286
287	C<normalize>: on request.
288
289	=head1 AUTHOR
290
291	SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
292
293	http://homepage1.nifty.com/nomenclator/perl/
294
295	Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
296
297	This program is free software; you can redistribute it and/or
298	modify it under the same terms as Perl itself.
299
300	=head1 SEE ALSO
301
302	=over 4
303
304	=item L<Lingua::KO::Hangul::Util>
305
306	utility functions for Hangul Syllables
307
308	=item http://www.unicode.org/unicode/reports/tr15/
309
310	Unicode Normalization Forms - UAX #15
311
312	=back
313
314	=cut