3 # This auxiliary script makes five header files
4 # used for building XSUB of Unicode::Normalize.
7 # <do 'mkheader'> in perl, or <perl mkheader> in command line
10 # unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11 # unicore/Decomposition.pl (or unicode/Decomposition.pl)
27 our $PACKAGE = 'Unicode::Normalize, mkheader';
30 our $structname = "${prefix}complist";
32 # Starting in v5.20, the tables in lib/unicore are built using the platform's
33 # native character set for code points 0-255. But in v5.35, pack U stopped
34 # trying to compensate
35 *pack_U = ($] ge 5.020 && $] lt 5.035)
36 ? sub { return pack('U*', map { utf8::unicode_to_native($_) } @_); }
37 : sub { return pack('U*', @_); };
39 # %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
40 our %Comp1st; # $codepoint => $listname : may be composed with a next char.
41 our %CompList; # $listname,$2nd => $codepoint : composite
43 ##### The below part is common to mkheader and PP #####
45 our %Combin; # $codepoint => $number : combination class
46 our %Canon; # $codepoint => \@codepoints : canonical decomp.
47 our %Compat; # $codepoint => \@codepoints : compat. decomp.
48 our %Compos; # $1st,$2nd => $codepoint : composite
49 our %Exclus; # $codepoint => 1 : composition exclusions
50 our %Single; # $codepoint => 1 : singletons
51 our %NonStD; # $codepoint => 1 : non-starter decompositions
52 our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
54 # from core Unicode database
55 our $Combin = do "unicore/CombiningClass.pl"
56 || do "unicode/CombiningClass.pl"
57 || croak "$PACKAGE: CombiningClass.pl not found";
58 our $Decomp = do "unicore/Decomposition.pl"
59 || do "unicode/Decomposition.pl"
60 || croak "$PACKAGE: Decomposition.pl not found";
62 # CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it
63 # would be better to get the values from Unicode::UCD rather than hard-code
64 # them here, as that will protect from having to make fixes for future
67 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
68 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
69 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
70 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
71 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
72 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
73 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
76 # definition of Hangul constants
77 use constant SBase => 0xAC00;
78 use constant SFinal => 0xD7A3; # SBase -1 + SCount
79 use constant SCount => 11172; # LCount * NCount
80 use constant NCount => 588; # VCount * TCount
81 use constant LBase => 0x1100;
82 use constant LFinal => 0x1112;
83 use constant LCount => 19;
84 use constant VBase => 0x1161;
85 use constant VFinal => 0x1175;
86 use constant VCount => 21;
87 use constant TBase => 0x11A7;
88 use constant TFinal => 0x11C2;
89 use constant TCount => 28;
92 my $sindex = $_[0] - SBase;
93 my $lindex = int( $sindex / NCount);
94 my $vindex = int(($sindex % NCount) / TCount);
95 my $tindex = $sindex % TCount;
99 $tindex ? (TBase + $tindex) : (),
101 return wantarray ? @ret : pack_U(@ret);
104 ########## getting full decomposition ##########
106 ## converts string "hhhh hhhh hhhh" to a numeric list
107 ## (hex digits separated by spaces)
108 sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
110 while ($Combin =~ /(.+)/g) {
111 my @tab = split /\t/, $1;
112 my $ini = hex $tab[0];
114 $Combin{$ini} = $tab[2];
116 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
120 while ($Decomp =~ /(.+)/g) {
121 my @tab = split /\t/, $1;
122 my $compat = $tab[2] =~ s/<[^>]+>//;
123 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
124 my $ini = hex($tab[0]); # initial decomposable character
125 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
126 # ($ini .. $end) is the range of decomposable characters.
128 foreach my $u ($ini .. $end) {
130 $Canon{$u} = $dec if ! $compat;
134 for my $s (@CompEx) {
136 next if !$Canon{$u}; # not assigned
137 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
141 foreach my $u (keys %Canon) {
142 my $dec = $Canon{$u};
145 if ($Combin{ $dec->[0] }) {
148 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
149 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
151 } elsif (@$dec == 1) {
154 my $h = sprintf '%04X', $u;
155 croak("Weird Canonical Decomposition of U+$h");
159 # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
160 foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
167 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
168 : $Canon{$_} ? @{ $Canon{$_} } : $_
170 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
171 # condition @src == @dec is not ok.
177 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
178 : $Compat{$_} ? @{ $Compat{$_} } : $_
180 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
181 # condition @src == @dec is not ok.
184 # exhaustive decomposition
185 foreach my $key (keys %Canon) {
186 $Canon{$key} = [ getCanonList($key) ];
189 # exhaustive decomposition
190 foreach my $key (keys %Compat) {
191 $Compat{$key} = [ getCompatList($key) ];
194 ##### The above part is common to mkheader and PP #####
196 foreach my $comp1st (keys %Compos) {
197 my $listname = sprintf("${structname}_%06x", $comp1st);
198 # %04x is bad since it'd place _3046 after _1d157.
199 $Comp1st{$comp1st} = $listname;
200 my $rh1st = $Compos{$comp1st};
202 foreach my $comp2nd (keys %$rh1st) {
203 my $uc = $rh1st->{$comp2nd};
204 $CompList{$listname}{$comp2nd} = $uc;
208 sub split_into_char {
211 my $len = length($uni);
213 for(my $i = 0; $i < $len; ++$i) {
214 push @ary, ord(substr($uni,$i,1));
220 sprintf '"%s"', join '',
221 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
224 foreach my $hash (\%Canon, \%Compat) {
225 foreach my $key (keys %$hash) {
226 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
230 ########## writing header files ##########
244 name => "NonStDecomp",
255 my $orig_fh = SelectSaver->new;
258 my $file = "unfexc.h";
259 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
260 binmode FH; select FH;
264 * This file is auto-generated by mkheader.
265 * Any changes here will be lost!
269 foreach my $tbl (@boolfunc) {
270 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
271 my $type = $tbl->{type};
272 my $name = $tbl->{name};
273 print "$type is$name (UV uv)\n{\nreturn\n\t";
276 my $cur = shift @temp;
277 if (@temp && $cur + 1 == $temp[0]) {
278 print "($cur <= uv && uv <= ";
279 while (@temp && $cur + 1 == $temp[0]) {
283 print "\n\t|| " if @temp;
286 print "\n\t|| " if @temp;
289 print "\n\t? TRUE : FALSE;\n}\n\n";
294 ####################################
297 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
299 foreach my $i (sort keys %CompList) {
300 $compinit .= "$structname $i [] = {\n";
301 $compinit .= join ",\n",
302 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
303 sort {$a <=> $b } keys %{ $CompList{$i} };
304 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
332 type => "$structname *",
339 foreach my $tbl (@tripletable) {
340 my $file = "$tbl->{file}.h";
341 my $head = "${prefix}$tbl->{name}";
342 my $type = $tbl->{type};
343 my $hash = $tbl->{hash};
344 my $null = $tbl->{null};
345 my $init = $tbl->{init};
347 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
348 binmode FH; select FH;
353 * This file is auto-generated by mkheader.
354 * Any changes here will be lost!
358 print $init if defined $init;
360 foreach my $uv (keys %$hash) {
361 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
362 unless $uv <= 0x10FFFF;
363 my @c = unpack 'CCCC', pack 'N', $uv;
364 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
367 foreach my $p (sort { $a <=> $b } keys %val) {
368 next if ! $val{ $p };
369 for (my $r = 0; $r < 256; $r++) {
370 next if ! $val{ $p }{ $r };
371 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
372 for (my $c = 0; $c < 256; $c++) {
373 print "\t", defined $val{$p}{$r}{$c}
374 ? "($type)".$val{$p}{$r}{$c}
376 print ',' if $c != 255;
377 print "\n" if $c % 8 == 7;
382 foreach my $p (sort { $a <=> $b } keys %val) {
383 next if ! $val{ $p };
384 printf "static $type* ${head}_%02x [256] = {\n", $p;
385 for (my $r = 0; $r < 256; $r++) {
386 print $val{ $p }{ $r }
387 ? sprintf("${head}_%02x_%02x", $p, $r)
389 print ',' if $r != 255;
390 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
394 print "static $type** $head [] = {\n";
395 for (my $p = 0; $p <= 0x10; $p++) {
396 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
397 print ',' if $p != 0x10;
404 } # End of block for SelectSaver