3 # This auxiliary script makes five header files
4 # used for building XSUB of Unicode::Normalize.
7 # <do 'mkheader'> in perl, or <perl mkheader> in command line
10 # unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11 # unicore/Decomposition.pl (or unicode/Decomposition.pl)
28 unless ('A' eq pack('U', 0x41)) {
29 die "Unicode::Normalize cannot stringify a Unicode code point\n";
31 unless (0x41 == unpack('U', 'A')) {
32 die "Unicode::Normalize cannot get Unicode code point\n";
36 our $PACKAGE = 'Unicode::Normalize, mkheader';
39 our $structname = "${prefix}complist";
41 # Starting in v5.20, the tables in lib/unicore are built using the platform's
42 # native character set for code points 0-255.
43 *pack_U = ($] ge 5.020)
44 ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns
45 # an empty UTF-8 string,
47 # force the return into
49 : sub { return pack('U*', @_); };
51 # %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
52 our %Comp1st; # $codepoint => $listname : may be composed with a next char.
53 our %CompList; # $listname,$2nd => $codepoint : composite
55 ##### The below part is common to mkheader and PP #####
57 our %Combin; # $codepoint => $number : combination class
58 our %Canon; # $codepoint => \@codepoints : canonical decomp.
59 our %Compat; # $codepoint => \@codepoints : compat. decomp.
60 our %Compos; # $1st,$2nd => $codepoint : composite
61 our %Exclus; # $codepoint => 1 : composition exclusions
62 our %Single; # $codepoint => 1 : singletons
63 our %NonStD; # $codepoint => 1 : non-starter decompositions
64 our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
66 # from core Unicode database
67 our $Combin = do "unicore/CombiningClass.pl"
68 || do "unicode/CombiningClass.pl"
69 || croak "$PACKAGE: CombiningClass.pl not found";
70 our $Decomp = do "unicore/Decomposition.pl"
71 || do "unicode/Decomposition.pl"
72 || croak "$PACKAGE: Decomposition.pl not found";
74 # CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it
75 # would be better to get the values from Unicode::UCD rather than hard-code
76 # them here, as that will protect from having to make fixes for future
79 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
80 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
81 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
82 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
83 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
84 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
85 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
88 # definition of Hangul constants
89 use constant SBase => 0xAC00;
90 use constant SFinal => 0xD7A3; # SBase -1 + SCount
91 use constant SCount => 11172; # LCount * NCount
92 use constant NCount => 588; # VCount * TCount
93 use constant LBase => 0x1100;
94 use constant LFinal => 0x1112;
95 use constant LCount => 19;
96 use constant VBase => 0x1161;
97 use constant VFinal => 0x1175;
98 use constant VCount => 21;
99 use constant TBase => 0x11A7;
100 use constant TFinal => 0x11C2;
101 use constant TCount => 28;
103 sub decomposeHangul {
104 my $sindex = $_[0] - SBase;
105 my $lindex = int( $sindex / NCount);
106 my $vindex = int(($sindex % NCount) / TCount);
107 my $tindex = $sindex % TCount;
111 $tindex ? (TBase + $tindex) : (),
113 return wantarray ? @ret : pack_U(@ret);
116 ########## getting full decomposition ##########
118 ## converts string "hhhh hhhh hhhh" to a numeric list
119 ## (hex digits separated by spaces)
120 sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
122 while ($Combin =~ /(.+)/g) {
123 my @tab = split /\t/, $1;
124 my $ini = hex $tab[0];
126 $Combin{$ini} = $tab[2];
128 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
132 while ($Decomp =~ /(.+)/g) {
133 my @tab = split /\t/, $1;
134 my $compat = $tab[2] =~ s/<[^>]+>//;
135 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
136 my $ini = hex($tab[0]); # initial decomposable character
137 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
138 # ($ini .. $end) is the range of decomposable characters.
140 foreach my $u ($ini .. $end) {
142 $Canon{$u} = $dec if ! $compat;
146 for my $s (@CompEx) {
148 next if !$Canon{$u}; # not assigned
149 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
153 foreach my $u (keys %Canon) {
154 my $dec = $Canon{$u};
157 if ($Combin{ $dec->[0] }) {
160 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
161 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
163 } elsif (@$dec == 1) {
166 my $h = sprintf '%04X', $u;
167 croak("Weird Canonical Decomposition of U+$h");
171 # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
172 foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
179 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
180 : $Canon{$_} ? @{ $Canon{$_} } : $_
182 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
183 # condition @src == @dec is not ok.
189 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
190 : $Compat{$_} ? @{ $Compat{$_} } : $_
192 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
193 # condition @src == @dec is not ok.
196 # exhaustive decomposition
197 foreach my $key (keys %Canon) {
198 $Canon{$key} = [ getCanonList($key) ];
201 # exhaustive decomposition
202 foreach my $key (keys %Compat) {
203 $Compat{$key} = [ getCompatList($key) ];
206 ##### The above part is common to mkheader and PP #####
208 foreach my $comp1st (keys %Compos) {
209 my $listname = sprintf("${structname}_%06x", $comp1st);
210 # %04x is bad since it'd place _3046 after _1d157.
211 $Comp1st{$comp1st} = $listname;
212 my $rh1st = $Compos{$comp1st};
214 foreach my $comp2nd (keys %$rh1st) {
215 my $uc = $rh1st->{$comp2nd};
216 $CompList{$listname}{$comp2nd} = $uc;
220 sub split_into_char {
223 my $len = length($uni);
225 for(my $i = 0; $i < $len; ++$i) {
226 push @ary, ord(substr($uni,$i,1));
232 sprintf '"%s"', join '',
233 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
236 foreach my $hash (\%Canon, \%Compat) {
237 foreach my $key (keys %$hash) {
238 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
242 ########## writing header files ##########
256 name => "NonStDecomp",
267 my $orig_fh = SelectSaver->new;
270 my $file = "unfexc.h";
271 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
272 binmode FH; select FH;
276 * This file is auto-generated by mkheader.
277 * Any changes here will be lost!
281 foreach my $tbl (@boolfunc) {
282 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
283 my $type = $tbl->{type};
284 my $name = $tbl->{name};
285 print "$type is$name (UV uv)\n{\nreturn\n\t";
288 my $cur = shift @temp;
289 if (@temp && $cur + 1 == $temp[0]) {
290 print "($cur <= uv && uv <= ";
291 while (@temp && $cur + 1 == $temp[0]) {
295 print "\n\t|| " if @temp;
298 print "\n\t|| " if @temp;
301 print "\n\t? TRUE : FALSE;\n}\n\n";
306 ####################################
309 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
311 foreach my $i (sort keys %CompList) {
312 $compinit .= "$structname $i [] = {\n";
313 $compinit .= join ",\n",
314 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
315 sort {$a <=> $b } keys %{ $CompList{$i} };
316 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
344 type => "$structname *",
351 foreach my $tbl (@tripletable) {
352 my $file = "$tbl->{file}.h";
353 my $head = "${prefix}$tbl->{name}";
354 my $type = $tbl->{type};
355 my $hash = $tbl->{hash};
356 my $null = $tbl->{null};
357 my $init = $tbl->{init};
359 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
360 binmode FH; select FH;
365 * This file is auto-generated by mkheader.
366 * Any changes here will be lost!
370 print $init if defined $init;
372 foreach my $uv (keys %$hash) {
373 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
374 unless $uv <= 0x10FFFF;
375 my @c = unpack 'CCCC', pack 'N', $uv;
376 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
379 foreach my $p (sort { $a <=> $b } keys %val) {
380 next if ! $val{ $p };
381 for (my $r = 0; $r < 256; $r++) {
382 next if ! $val{ $p }{ $r };
383 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
384 for (my $c = 0; $c < 256; $c++) {
385 print "\t", defined $val{$p}{$r}{$c}
386 ? "($type)".$val{$p}{$r}{$c}
388 print ',' if $c != 255;
389 print "\n" if $c % 8 == 7;
394 foreach my $p (sort { $a <=> $b } keys %val) {
395 next if ! $val{ $p };
396 printf "static $type* ${head}_%02x [256] = {\n", $p;
397 for (my $r = 0; $r < 256; $r++) {
398 print $val{ $p }{ $r }
399 ? sprintf("${head}_%02x_%02x", $p, $r)
401 print ',' if $r != 255;
402 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
406 print "static $type** $head [] = {\n";
407 for (my $p = 0; $p <= 0x10; $p++) {
408 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
409 print ',' if $p != 0x10;
416 } # End of block for SelectSaver