Commit | Line | Data |
---|---|---|
ac5ea531 JH |
1 | #!perl |
2 | # | |
628bbff0 RGS |
3 | # This auxiliary script makes five header files |
4 | # used for building XSUB of Unicode::Normalize. | |
ac5ea531 | 5 | # |
628bbff0 RGS |
6 | # Usage: |
7 | # <do 'mkheader'> in perl, or <perl mkheader> in command line | |
8 | # | |
9 | # Input files: | |
10 | # unicore/CombiningClass.pl (or unicode/CombiningClass.pl) | |
11 | # unicore/Decomposition.pl (or unicode/Decomposition.pl) | |
628bbff0 RGS |
12 | # |
13 | # Output files: | |
14 | # unfcan.h | |
15 | # unfcpt.h | |
16 | # unfcmb.h | |
17 | # unfcmp.h | |
18 | # unfexc.h | |
f027f502 | 19 | # |
ac5ea531 JH |
20 | use 5.006; |
21 | use strict; | |
22 | use warnings; | |
23 | use Carp; | |
6c941e0c JH |
24 | use File::Spec; |
25 | ||
9f1f04a1 | 26 | BEGIN { |
1efaba7f | 27 | unless ("A" eq pack('U', 0x41)) { |
9f1f04a1 RGS |
28 | die "Unicode::Normalize cannot stringify a Unicode code point\n"; |
29 | } | |
30 | } | |
ac5ea531 JH |
31 | |
32 | our $PACKAGE = 'Unicode::Normalize, mkheader'; | |
33 | ||
a96160d6 CBW |
34 | our $prefix = "UNF_"; |
35 | our $structname = "${prefix}complist"; | |
ac5ea531 | 36 | |
a96160d6 CBW |
37 | sub pack_U { |
38 | return pack('U*', @_); | |
39 | } | |
40 | ||
41 | # %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() | |
42 | our %Comp1st; # $codepoint => $listname : may be composed with a next char. | |
43 | our %CompList; # $listname,$2nd => $codepoint : composite | |
44 | ||
45 | ##### The below part is common to mkheader and PP ##### | |
ac5ea531 | 46 | |
48287974 | 47 | our %Combin; # $codepoint => $number : combination class |
6c941e0c JH |
48 | our %Canon; # $codepoint => \@codepoints : canonical decomp. |
49 | our %Compat; # $codepoint => \@codepoints : compat. decomp. | |
a96160d6 | 50 | our %Compos; # $1st,$2nd => $codepoint : composite |
48287974 TS |
51 | our %Exclus; # $codepoint => 1 : composition exclusions |
52 | our %Single; # $codepoint => 1 : singletons | |
53 | our %NonStD; # $codepoint => 1 : non-starter decompositions | |
48287974 | 54 | our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. |
48287974 | 55 | |
e0a65de5 | 56 | # from core Unicode database |
a96160d6 CBW |
57 | our $Combin = do "unicore/CombiningClass.pl" |
58 | || do "unicode/CombiningClass.pl" | |
59 | || croak "$PACKAGE: CombiningClass.pl not found"; | |
60 | our $Decomp = do "unicore/Decomposition.pl" | |
61 | || do "unicode/Decomposition.pl" | |
62 | || croak "$PACKAGE: Decomposition.pl not found"; | |
ac5ea531 | 63 | |
e0a65de5 CBW |
64 | # CompositionExclusions.txt since Unicode 3.2.0 |
65 | our @CompEx = qw( | |
66 | 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36 | |
67 | 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 | |
68 | 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D | |
69 | FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B | |
70 | FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C | |
71 | FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB | |
72 | 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0 | |
73 | ); | |
74 | ||
a96160d6 | 75 | # definition of Hangul constants |
6c941e0c JH |
76 | use constant SBase => 0xAC00; |
77 | use constant SFinal => 0xD7A3; # SBase -1 + SCount | |
78 | use constant SCount => 11172; # LCount * NCount | |
79 | use constant NCount => 588; # VCount * TCount | |
80 | use constant LBase => 0x1100; | |
81 | use constant LFinal => 0x1112; | |
82 | use constant LCount => 19; | |
83 | use constant VBase => 0x1161; | |
84 | use constant VFinal => 0x1175; | |
85 | use constant VCount => 21; | |
86 | use constant TBase => 0x11A7; | |
87 | use constant TFinal => 0x11C2; | |
88 | use constant TCount => 28; | |
89 | ||
90 | sub decomposeHangul { | |
a96160d6 CBW |
91 | my $sindex = $_[0] - SBase; |
92 | my $lindex = int( $sindex / NCount); | |
93 | my $vindex = int(($sindex % NCount) / TCount); | |
94 | my $tindex = $sindex % TCount; | |
6c941e0c | 95 | my @ret = ( |
a96160d6 CBW |
96 | LBase + $lindex, |
97 | VBase + $vindex, | |
98 | $tindex ? (TBase + $tindex) : (), | |
6c941e0c | 99 | ); |
a96160d6 | 100 | return wantarray ? @ret : pack_U(@ret); |
51683ce6 TS |
101 | } |
102 | ||
788ba0f4 | 103 | ########## getting full decomposition ########## |
ac5ea531 | 104 | |
48287974 | 105 | ## converts string "hhhh hhhh hhhh" to a numeric list |
a96160d6 CBW |
106 | ## (hex digits separated by spaces) |
107 | sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } | |
48287974 | 108 | |
8f118dcd JH |
109 | while ($Combin =~ /(.+)/g) { |
110 | my @tab = split /\t/, $1; | |
111 | my $ini = hex $tab[0]; | |
112 | if ($tab[1] eq '') { | |
51683ce6 | 113 | $Combin{$ini} = $tab[2]; |
8f118dcd | 114 | } else { |
51683ce6 | 115 | $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); |
8f118dcd | 116 | } |
ac5ea531 JH |
117 | } |
118 | ||
8f118dcd JH |
119 | while ($Decomp =~ /(.+)/g) { |
120 | my @tab = split /\t/, $1; | |
121 | my $compat = $tab[2] =~ s/<[^>]+>//; | |
122 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition | |
a96160d6 | 123 | my $ini = hex($tab[0]); # initial decomposable character |
51683ce6 TS |
124 | my $end = $tab[1] eq '' ? $ini : hex($tab[1]); |
125 | # ($ini .. $end) is the range of decomposable characters. | |
48287974 | 126 | |
51683ce6 TS |
127 | foreach my $u ($ini .. $end) { |
128 | $Compat{$u} = $dec; | |
e0a65de5 CBW |
129 | $Canon{$u} = $dec if ! $compat; |
130 | } | |
131 | } | |
132 | ||
133 | for my $s (@CompEx) { | |
134 | my $u = hex $s; | |
135 | next if !$Canon{$u}; # not assigned | |
136 | next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2 | |
137 | $Exclus{$u} = 1; | |
138 | } | |
8f118dcd | 139 | |
e0a65de5 CBW |
140 | foreach my $u (keys %Canon) { |
141 | my $dec = $Canon{$u}; | |
142 | ||
143 | if (@$dec == 2) { | |
144 | if ($Combin{ $dec->[0] }) { | |
145 | $NonStD{$u} = 1; | |
146 | } else { | |
147 | $Compos{ $dec->[0] }{ $dec->[1] } = $u; | |
148 | $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; | |
8f118dcd | 149 | } |
e0a65de5 CBW |
150 | } elsif (@$dec == 1) { |
151 | $Single{$u} = 1; | |
152 | } else { | |
153 | my $h = sprintf '%04X', $u; | |
154 | croak("Weird Canonical Decomposition of U+$h"); | |
ac5ea531 | 155 | } |
ac5ea531 JH |
156 | } |
157 | ||
48287974 TS |
158 | # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo |
159 | foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { | |
160 | $Comp2nd{$j} = 1; | |
ac5ea531 JH |
161 | } |
162 | ||
163 | sub getCanonList { | |
8f118dcd | 164 | my @src = @_; |
6c941e0c JH |
165 | my @dec = map { |
166 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) | |
167 | : $Canon{$_} ? @{ $Canon{$_} } : $_ | |
168 | } @src; | |
8f118dcd JH |
169 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); |
170 | # condition @src == @dec is not ok. | |
ac5ea531 JH |
171 | } |
172 | ||
173 | sub getCompatList { | |
8f118dcd | 174 | my @src = @_; |
6c941e0c JH |
175 | my @dec = map { |
176 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) | |
177 | : $Compat{$_} ? @{ $Compat{$_} } : $_ | |
178 | } @src; | |
8f118dcd JH |
179 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); |
180 | # condition @src == @dec is not ok. | |
ac5ea531 JH |
181 | } |
182 | ||
48287974 TS |
183 | # exhaustive decomposition |
184 | foreach my $key (keys %Canon) { | |
185 | $Canon{$key} = [ getCanonList($key) ]; | |
186 | } | |
187 | ||
188 | # exhaustive decomposition | |
628bbff0 | 189 | foreach my $key (keys %Compat) { |
48287974 TS |
190 | $Compat{$key} = [ getCompatList($key) ]; |
191 | } | |
ac5ea531 | 192 | |
a96160d6 CBW |
193 | ##### The above part is common to mkheader and PP ##### |
194 | ||
a96160d6 CBW |
195 | foreach my $comp1st (keys %Compos) { |
196 | my $listname = sprintf("${structname}_%06x", $comp1st); | |
197 | # %04x is bad since it'd place _3046 after _1d157. | |
198 | $Comp1st{$comp1st} = $listname; | |
199 | my $rh1st = $Compos{$comp1st}; | |
200 | ||
201 | foreach my $comp2nd (keys %$rh1st) { | |
202 | my $uc = $rh1st->{$comp2nd}; | |
203 | $CompList{$listname}{$comp2nd} = $uc; | |
a96160d6 | 204 | } |
9f1f04a1 RGS |
205 | } |
206 | ||
2b8d773d RGS |
207 | sub split_into_char { |
208 | use bytes; | |
209 | my $uni = shift; | |
210 | my $len = length($uni); | |
211 | my @ary; | |
212 | for(my $i = 0; $i < $len; ++$i) { | |
213 | push @ary, ord(substr($uni,$i,1)); | |
214 | } | |
215 | return @ary; | |
216 | } | |
217 | ||
ac5ea531 | 218 | sub _U_stringify { |
8f118dcd | 219 | sprintf '"%s"', join '', |
a96160d6 | 220 | map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); |
ac5ea531 JH |
221 | } |
222 | ||
223 | foreach my $hash (\%Canon, \%Compat) { | |
8f118dcd JH |
224 | foreach my $key (keys %$hash) { |
225 | $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); | |
226 | } | |
ac5ea531 JH |
227 | } |
228 | ||
6c941e0c | 229 | ########## writing header files ########## |
ac5ea531 | 230 | |
8f118dcd JH |
231 | my @boolfunc = ( |
232 | { | |
233 | name => "Exclusion", | |
234 | type => "bool", | |
235 | hash => \%Exclus, | |
236 | }, | |
237 | { | |
238 | name => "Singleton", | |
239 | type => "bool", | |
240 | hash => \%Single, | |
241 | }, | |
242 | { | |
243 | name => "NonStDecomp", | |
244 | type => "bool", | |
245 | hash => \%NonStD, | |
246 | }, | |
247 | { | |
248 | name => "Comp2nd", | |
249 | type => "bool", | |
250 | hash => \%Comp2nd, | |
251 | }, | |
252 | ); | |
ac5ea531 JH |
253 | |
254 | my $file = "unfexc.h"; | |
255 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; | |
256 | binmode FH; select FH; | |
257 | ||
8f118dcd JH |
258 | print << 'EOF'; |
259 | /* | |
260 | * This file is auto-generated by mkheader. | |
261 | * Any changes here will be lost! | |
262 | */ | |
263 | EOF | |
ac5ea531 | 264 | |
8f118dcd JH |
265 | foreach my $tbl (@boolfunc) { |
266 | my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; | |
267 | my $type = $tbl->{type}; | |
268 | my $name = $tbl->{name}; | |
269 | print "$type is$name (UV uv)\n{\nreturn\n\t"; | |
270 | ||
271 | while (@temp) { | |
272 | my $cur = shift @temp; | |
273 | if (@temp && $cur + 1 == $temp[0]) { | |
274 | print "($cur <= uv && uv <= "; | |
275 | while (@temp && $cur + 1 == $temp[0]) { | |
276 | $cur = shift @temp; | |
277 | } | |
278 | print "$cur)"; | |
279 | print "\n\t|| " if @temp; | |
280 | } else { | |
281 | print "uv == $cur"; | |
282 | print "\n\t|| " if @temp; | |
283 | } | |
ac5ea531 | 284 | } |
8f118dcd | 285 | print "\n\t? TRUE : FALSE;\n}\n\n"; |
ac5ea531 JH |
286 | } |
287 | ||
ac5ea531 JH |
288 | close FH; |
289 | ||
290 | #################################### | |
291 | ||
48287974 TS |
292 | my $compinit = |
293 | "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; | |
294 | ||
295 | foreach my $i (sort keys %CompList) { | |
296 | $compinit .= "$structname $i [] = {\n"; | |
297 | $compinit .= join ",\n", | |
298 | map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), | |
299 | sort {$a <=> $b } keys %{ $CompList{$i} }; | |
300 | $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel | |
301 | } | |
302 | ||
ac5ea531 | 303 | my @tripletable = ( |
8f118dcd JH |
304 | { |
305 | file => "unfcmb", | |
306 | name => "combin", | |
307 | type => "STDCHAR", | |
308 | hash => \%Combin, | |
309 | null => 0, | |
310 | }, | |
311 | { | |
312 | file => "unfcan", | |
313 | name => "canon", | |
314 | type => "char*", | |
315 | hash => \%Canon, | |
316 | null => "NULL", | |
317 | }, | |
318 | { | |
319 | file => "unfcpt", | |
320 | name => "compat", | |
321 | type => "char*", | |
322 | hash => \%Compat, | |
323 | null => "NULL", | |
324 | }, | |
325 | { | |
326 | file => "unfcmp", | |
327 | name => "compos", | |
328 | type => "$structname *", | |
329 | hash => \%Comp1st, | |
330 | null => "NULL", | |
331 | init => $compinit, | |
332 | }, | |
ac5ea531 JH |
333 | ); |
334 | ||
335 | foreach my $tbl (@tripletable) { | |
8f118dcd JH |
336 | my $file = "$tbl->{file}.h"; |
337 | my $head = "${prefix}$tbl->{name}"; | |
338 | my $type = $tbl->{type}; | |
339 | my $hash = $tbl->{hash}; | |
340 | my $null = $tbl->{null}; | |
341 | my $init = $tbl->{init}; | |
342 | ||
343 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; | |
344 | binmode FH; select FH; | |
345 | my %val; | |
346 | ||
347 | print FH << 'EOF'; | |
ac5ea531 JH |
348 | /* |
349 | * This file is auto-generated by mkheader. | |
350 | * Any changes here will be lost! | |
351 | */ | |
352 | EOF | |
353 | ||
8f118dcd JH |
354 | print $init if defined $init; |
355 | ||
356 | foreach my $uv (keys %$hash) { | |
f027f502 JH |
357 | croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) |
358 | unless $uv <= 0x10FFFF; | |
8f118dcd JH |
359 | my @c = unpack 'CCCC', pack 'N', $uv; |
360 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; | |
361 | } | |
362 | ||
363 | foreach my $p (sort { $a <=> $b } keys %val) { | |
364 | next if ! $val{ $p }; | |
365 | for (my $r = 0; $r < 256; $r++) { | |
366 | next if ! $val{ $p }{ $r }; | |
fe067ad9 | 367 | printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; |
8f118dcd JH |
368 | for (my $c = 0; $c < 256; $c++) { |
369 | print "\t", defined $val{$p}{$r}{$c} | |
370 | ? "($type)".$val{$p}{$r}{$c} | |
371 | : $null; | |
372 | print ',' if $c != 255; | |
373 | print "\n" if $c % 8 == 7; | |
374 | } | |
375 | print "};\n\n"; | |
376 | } | |
377 | } | |
378 | foreach my $p (sort { $a <=> $b } keys %val) { | |
379 | next if ! $val{ $p }; | |
fe067ad9 | 380 | printf "static $type* ${head}_%02x [256] = {\n", $p; |
8f118dcd JH |
381 | for (my $r = 0; $r < 256; $r++) { |
382 | print $val{ $p }{ $r } | |
383 | ? sprintf("${head}_%02x_%02x", $p, $r) | |
384 | : "NULL"; | |
385 | print ',' if $r != 255; | |
386 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; | |
387 | } | |
388 | print "};\n\n"; | |
ac5ea531 | 389 | } |
fe067ad9 | 390 | print "static $type** $head [] = {\n"; |
8f118dcd JH |
391 | for (my $p = 0; $p <= 0x10; $p++) { |
392 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; | |
393 | print ',' if $p != 0x10; | |
394 | print "\n"; | |
ac5ea531 JH |
395 | } |
396 | print "};\n\n"; | |
8f118dcd | 397 | close FH; |
ac5ea531 JH |
398 | } |
399 | ||
628bbff0 | 400 | 1; |
ac5ea531 | 401 | __END__ |