#!perl
#
# This auxiliary script makes five header files
# used for building XSUB of Unicode::Normalize.
#
# Usage:
#    <do 'mkheader'> in perl, or <perl mkheader> in command line
#
# Input files:
#    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
#    unicore/Decomposition.pl (or unicode/Decomposition.pl)
#    unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
#
# Output files:
#    unfcan.h
#    unfcpt.h
#    unfcmb.h
#    unfcmp.h
#    unfexc.h
#
use 5.006;
use strict;
use warnings;
use Carp;
use File::Spec;
+
BEGIN {
    # Sanity check at compile time: everything below relies on
    # pack 'U' turning a Unicode code point into a character.
    # Abort immediately on a perl where that does not hold.
    unless ("A" eq pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
# Package-level state shared by the parsing and header-writing phases.
our $PACKAGE = 'Unicode::Normalize, mkheader';
# NOTE(review): the next line is an orphaned fragment — the open(...)
# expression it belongs to is elided from this chunk.  Confirm against
# the full file before editing anything here.
|| croak "$PACKAGE: Decomposition.pl not found";
our %Combin; # $codepoint => $number : combination class
-our %Canon; # $codepoint => $hexstring : canonical decomp.
-our %Compat; # $codepoint => $hexstring : compat. decomp.
+our %Canon; # $codepoint => \@codepoints : canonical decomp.
+our %Compat; # $codepoint => \@codepoints : compat. decomp.
+# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
# NOTE(review): the four lines above are unresolved diff residue
# ('-'/'+' markers); the '+' pair is presumably the intended state —
# TODO resolve against the original patch.
our %Exclus; # $codepoint => 1 : composition exclusions
our %Single; # $codepoint => 1 : singletons
our %NonStD; # $codepoint => 1 : non-starter decompositions
# Prefix used for every generated C identifier (cf. the .h outputs).
our $prefix = "UNF_";
our $structname = "${prefix}complist";
########## definition of Hangul constants ##########
# Constants for algorithmic Hangul decomposition/composition,
# per Unicode Standard ch. 3.12 (Conjoining Jamo Behavior):
# a syllable S decomposes to L (leading consonant), V (vowel),
# and an optional T (trailing consonant).
use constant SBase  => 0xAC00;
use constant SFinal => 0xD7A3; # SBase - 1 + SCount
use constant SCount => 11172;  # LCount * NCount
use constant NCount => 588;    # VCount * TCount
use constant LBase  => 0x1100;
use constant LFinal => 0x1112; # LBase - 1 + LCount
use constant LCount => 19;
use constant VBase  => 0x1161;
use constant VFinal => 0x1175; # VBase - 1 + VCount
use constant VCount => 21;
use constant TBase  => 0x11A7;
use constant TFinal => 0x11C2; # TBase - 1 + TCount
use constant TCount => 28;
+
sub decomposeHangul {
    # Algorithmically decompose one precomposed Hangul syllable
    # (a code point in SBase..SFinal) into its conjoining Jamo.
    # Returns the list of Jamo code points in list context, or the
    # corresponding character string in scalar context.
    my $SIndex = $_[0] - SBase;
    my $LIndex = int( $SIndex / NCount);
    my $VIndex = int(($SIndex % NCount) / TCount);
    my $TIndex = $SIndex % TCount;
    my @ret = (
        LBase + $LIndex,
        VBase + $VIndex,
        $TIndex ? (TBase + $TIndex) : (), # T is omitted when absent
    );
    wantarray ? @ret : pack('U*', @ret);
    # Every element of @ret is a Jamo code point (>= 0x1100 > 0xFF),
    # so no unicode-to-native conversion is needed here.
}
+
########## getting full decomposition ##########
# NOTE(review): this bare block is truncated in this chunk — the
# foreach below is never closed here and most of its body (opening
# the input files, presumably) is elided.  The '-' marked line is
# unresolved diff residue.  Do not edit without the full file.
{
    my($f, $fh);
    # Search @INC for the exclusion data, trying the modern name
    # first, then the legacy 8.3-safe name.
    foreach my $d (@INC) {
-        use File::Spec;
        $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
        last if open($fh, $f);
        $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
sub getCanonList {
    # Return the full (recursive) canonical decomposition of a list
    # of code points.  Hangul syllables are decomposed algorithmically
    # via decomposeHangul(); everything else is looked up in %Canon
    # (which at this stage maps codepoint => \@codepoints).
    my @src = @_;
    my @dec = map {
        (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
            : $Canon{$_} ? @{ $Canon{$_} } : $_
    } @src;
    # Recurse until a fixed point.  Comparing the joined strings is
    # deliberate: the condition @src == @dec (element counts) is not
    # ok, since a decomposition step can preserve length.
    return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
}
sub getCompatList {
    # Return the full (recursive) compatibility decomposition of a
    # list of code points.  Hangul syllables are decomposed
    # algorithmically via decomposeHangul(); everything else is
    # looked up in %Compat (codepoint => \@codepoints at this stage).
    my @src = @_;
    my @dec = map {
        (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
            : $Compat{$_} ? @{ $Compat{$_} } : $_
    } @src;
    # Recurse until a fixed point.  Comparing the joined strings is
    # deliberate: the condition @src == @dec (element counts) is not
    # ok, since a decomposition step can preserve length.
    return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
}
}
# Exhaustive decomposition: replace each single-step decomposition
# stored in %Compat with its fully recursive expansion.
# NOTE(review): the matching loop over %Canon is presumably elided
# from this chunk — confirm against the full file.
foreach my $key (keys %Compat) {
    $Compat{$key} = [ getCompatList($key) ];
}
sub _pack_U {
    # Build a character string from a list of Unicode code points.
    # Factored out so every codepoint-to-string conversion in this
    # script goes through one place.
    return pack('U*', @_);
}
+
sub _U_stringify {
    # Render a list of code points as a double-quoted C string
    # literal of the UTF-8 octets, e.g. (0x41, 0xE9) => "\x41\xc3\xa9",
    # suitable for embedding in the generated headers.
    sprintf '"%s"', join '',
        map sprintf("\\x%02x", $_), unpack 'C*', _pack_U(@_);
}
# NOTE(review): a diff-residue comment here (asking whether
# utf8::unicode_to_native is needed for EBCDIC) was marked for
# deletion; _pack_U() above was factored out instead.
# NOTE(review): the inner loop body is elided in this chunk —
# presumably it converts each \@codepoints value of %Canon/%Compat
# into a C-string via _U_stringify(); confirm against the full file.
foreach my $hash (\%Canon, \%Compat) {
    foreach my $key (keys %$hash) {
    }
}
########## writing header files ##########
# NOTE(review): this region is heavily elided in this chunk — the
# @boolfunc initializer and everything between it and "close FH;"
# are not visible here.  Do not edit without the full file.
my @boolfunc = (
    {
    close FH;
}
# Return a true value so that <do 'mkheader'> succeeds.
1;
__END__