3 # This auxiliary script makes five header files
4 # used for building XSUB of Unicode::Normalize.
7 # <do 'mkheader'> in perl, or <perl mkheader> in command line
10 # unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11 # unicore/Decomposition.pl (or unicode/Decomposition.pl)
27 unless ("A" eq pack('U', 0x41)) {
28 die "Unicode::Normalize cannot stringify a Unicode code point\n";
32 our $PACKAGE = 'Unicode::Normalize, mkheader';
35 our $structname = "${prefix}complist";
38 return pack('U*', @_);
41 # %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
42 our %Comp1st; # $codepoint => $listname : may be composed with a next char.
43 our %CompList; # $listname,$2nd => $codepoint : composite
45 ##### The below part is common to mkheader and PP #####
47 our %Combin; # $codepoint => $number : combination class
48 our %Canon; # $codepoint => \@codepoints : canonical decomp.
49 our %Compat; # $codepoint => \@codepoints : compat. decomp.
50 our %Compos; # $1st,$2nd => $codepoint : composite
51 our %Exclus; # $codepoint => 1 : composition exclusions
52 our %Single; # $codepoint => 1 : singletons
53 our %NonStD; # $codepoint => 1 : non-starter decompositions
54 our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
56 # from core Unicode database
57 our $Combin = do "unicore/CombiningClass.pl"
58 || do "unicode/CombiningClass.pl"
59 || croak "$PACKAGE: CombiningClass.pl not found";
60 our $Decomp = do "unicore/Decomposition.pl"
61 || do "unicode/Decomposition.pl"
62 || croak "$PACKAGE: Decomposition.pl not found";
64 # CompositionExclusions.txt since Unicode 3.2.0
66 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
67 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
68 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
69 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
70 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
71 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
72 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
75 # definition of Hangul constants
76 use constant SBase => 0xAC00;
77 use constant SFinal => 0xD7A3; # SBase -1 + SCount
78 use constant SCount => 11172; # LCount * NCount
79 use constant NCount => 588; # VCount * TCount
80 use constant LBase => 0x1100;
81 use constant LFinal => 0x1112;
82 use constant LCount => 19;
83 use constant VBase => 0x1161;
84 use constant VFinal => 0x1175;
85 use constant VCount => 21;
86 use constant TBase => 0x11A7;
87 use constant TFinal => 0x11C2;
88 use constant TCount => 28;
91 my $sindex = $_[0] - SBase;
92 my $lindex = int( $sindex / NCount);
93 my $vindex = int(($sindex % NCount) / TCount);
94 my $tindex = $sindex % TCount;
98 $tindex ? (TBase + $tindex) : (),
100 return wantarray ? @ret : pack_U(@ret);
103 ########## getting full decomposition ##########
105 ## converts string "hhhh hhhh hhhh" to a numeric list
106 ## (hex digits separated by spaces)
107 sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
109 while ($Combin =~ /(.+)/g) {
110 my @tab = split /\t/, $1;
111 my $ini = hex $tab[0];
113 $Combin{$ini} = $tab[2];
115 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
119 while ($Decomp =~ /(.+)/g) {
120 my @tab = split /\t/, $1;
121 my $compat = $tab[2] =~ s/<[^>]+>//;
122 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
123 my $ini = hex($tab[0]); # initial decomposable character
124 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
125 # ($ini .. $end) is the range of decomposable characters.
127 foreach my $u ($ini .. $end) {
129 $Canon{$u} = $dec if ! $compat;
133 for my $s (@CompEx) {
135 next if !$Canon{$u}; # not assigned
136 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
140 foreach my $u (keys %Canon) {
141 my $dec = $Canon{$u};
144 if ($Combin{ $dec->[0] }) {
147 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
148 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
150 } elsif (@$dec == 1) {
153 my $h = sprintf '%04X', $u;
154 croak("Weird Canonical Decomposition of U+$h");
158 # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
159 foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
166 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
167 : $Canon{$_} ? @{ $Canon{$_} } : $_
169 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
170 # condition @src == @dec is not ok.
176 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
177 : $Compat{$_} ? @{ $Compat{$_} } : $_
179 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
180 # condition @src == @dec is not ok.
183 # exhaustive decomposition
184 foreach my $key (keys %Canon) {
185 $Canon{$key} = [ getCanonList($key) ];
188 # exhaustive decomposition
189 foreach my $key (keys %Compat) {
190 $Compat{$key} = [ getCompatList($key) ];
193 ##### The above part is common to mkheader and PP #####
195 foreach my $comp1st (keys %Compos) {
196 my $listname = sprintf("${structname}_%06x", $comp1st);
197 # %04x is bad since it'd place _3046 after _1d157.
198 $Comp1st{$comp1st} = $listname;
199 my $rh1st = $Compos{$comp1st};
201 foreach my $comp2nd (keys %$rh1st) {
202 my $uc = $rh1st->{$comp2nd};
203 $CompList{$listname}{$comp2nd} = $uc;
207 sub split_into_char {
210 my $len = length($uni);
212 for(my $i = 0; $i < $len; ++$i) {
213 push @ary, ord(substr($uni,$i,1));
219 sprintf '"%s"', join '',
220 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
223 foreach my $hash (\%Canon, \%Compat) {
224 foreach my $key (keys %$hash) {
225 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
229 ########## writing header files ##########
243 name => "NonStDecomp",
254 my $file = "unfexc.h";
255 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
256 binmode FH; select FH;
260 * This file is auto-generated by mkheader.
261 * Any changes here will be lost!
265 foreach my $tbl (@boolfunc) {
266 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
267 my $type = $tbl->{type};
268 my $name = $tbl->{name};
269 print "$type is$name (UV uv)\n{\nreturn\n\t";
272 my $cur = shift @temp;
273 if (@temp && $cur + 1 == $temp[0]) {
274 print "($cur <= uv && uv <= ";
275 while (@temp && $cur + 1 == $temp[0]) {
279 print "\n\t|| " if @temp;
282 print "\n\t|| " if @temp;
285 print "\n\t? TRUE : FALSE;\n}\n\n";
290 ####################################
293 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
295 foreach my $i (sort keys %CompList) {
296 $compinit .= "$structname $i [] = {\n";
297 $compinit .= join ",\n",
298 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
299 sort {$a <=> $b } keys %{ $CompList{$i} };
300 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
328 type => "$structname *",
335 foreach my $tbl (@tripletable) {
336 my $file = "$tbl->{file}.h";
337 my $head = "${prefix}$tbl->{name}";
338 my $type = $tbl->{type};
339 my $hash = $tbl->{hash};
340 my $null = $tbl->{null};
341 my $init = $tbl->{init};
343 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
344 binmode FH; select FH;
349 * This file is auto-generated by mkheader.
350 * Any changes here will be lost!
354 print $init if defined $init;
356 foreach my $uv (keys %$hash) {
357 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
358 unless $uv <= 0x10FFFF;
359 my @c = unpack 'CCCC', pack 'N', $uv;
360 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
363 foreach my $p (sort { $a <=> $b } keys %val) {
364 next if ! $val{ $p };
365 for (my $r = 0; $r < 256; $r++) {
366 next if ! $val{ $p }{ $r };
367 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
368 for (my $c = 0; $c < 256; $c++) {
369 print "\t", defined $val{$p}{$r}{$c}
370 ? "($type)".$val{$p}{$r}{$c}
372 print ',' if $c != 255;
373 print "\n" if $c % 8 == 7;
378 foreach my $p (sort { $a <=> $b } keys %val) {
379 next if ! $val{ $p };
380 printf "static $type* ${head}_%02x [256] = {\n", $p;
381 for (my $r = 0; $r < 256; $r++) {
382 print $val{ $p }{ $r }
383 ? sprintf("${head}_%02x_%02x", $p, $r)
385 print ',' if $r != 255;
386 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
390 print "static $type** $head [] = {\n";
391 for (my $p = 0; $p <= 0x10; $p++) {
392 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
393 print ',' if $p != 0x10;