Commit | Line | Data |
---|---|---|
c6b7cc21 SH |
1 | #!perl |
2 | # | |
3 | # This auxiliary script makes five header files | |
4 | # used for building XSUB of Unicode::Normalize. | |
5 | # | |
6 | # Usage: | |
7 | # <do 'mkheader'> in perl, or <perl mkheader> in command line | |
8 | # | |
9 | # Input files: | |
10 | # unicore/CombiningClass.pl (or unicode/CombiningClass.pl) | |
11 | # unicore/Decomposition.pl (or unicode/Decomposition.pl) | |
12 | # | |
13 | # Output files: | |
14 | # unfcan.h | |
15 | # unfcpt.h | |
16 | # unfcmb.h | |
17 | # unfcmp.h | |
18 | # unfexc.h | |
19 | # | |
20 | use 5.006; | |
21 | use strict; | |
22 | use warnings; | |
23 | use Carp; | |
24 | use File::Spec; | |
25 | use SelectSaver; | |
26 | ||
27 | BEGIN { | |
28 | unless ('A' eq pack('U', 0x41)) { | |
29 | die "Unicode::Normalize cannot stringify a Unicode code point\n"; | |
30 | } | |
31 | unless (0x41 == unpack('U', 'A')) { | |
32 | die "Unicode::Normalize cannot get Unicode code point\n"; | |
33 | } | |
34 | } | |
35 | ||
36 | our $PACKAGE = 'Unicode::Normalize, mkheader'; | |
37 | ||
38 | our $prefix = "UNF_"; | |
39 | our $structname = "${prefix}complist"; | |
40 | ||
41 | # Starting in v5.20, the tables in lib/unicore are built using the platform's | |
42 | # native character set for code points 0-255. | |
43 | *pack_U = ($] ge 5.020) | |
44 | ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns | |
45 | # an empty UTF-8 string, | |
46 | # so the effect is to | |
47 | # force the return into | |
48 | # being UTF-8. | |
49 | : sub { return pack('U*', @_); }; | |
50 | ||
51 | # %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify() | |
52 | our %Comp1st; # $codepoint => $listname : may be composed with a next char. | |
53 | our %CompList; # $listname,$2nd => $codepoint : composite | |
54 | ||
55 | ##### The below part is common to mkheader and PP ##### | |
56 | ||
57 | our %Combin; # $codepoint => $number : combination class | |
58 | our %Canon; # $codepoint => \@codepoints : canonical decomp. | |
59 | our %Compat; # $codepoint => \@codepoints : compat. decomp. | |
60 | our %Compos; # $1st,$2nd => $codepoint : composite | |
61 | our %Exclus; # $codepoint => 1 : composition exclusions | |
62 | our %Single; # $codepoint => 1 : singletons | |
63 | our %NonStD; # $codepoint => 1 : non-starter decompositions | |
64 | our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. | |
65 | ||
66 | # from core Unicode database | |
67 | our $Combin = do "unicore/CombiningClass.pl" | |
68 | || do "unicode/CombiningClass.pl" | |
69 | || croak "$PACKAGE: CombiningClass.pl not found"; | |
70 | our $Decomp = do "unicore/Decomposition.pl" | |
71 | || do "unicode/Decomposition.pl" | |
72 | || croak "$PACKAGE: Decomposition.pl not found"; | |
73 | ||
74 | # CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it | |
75 | # would be better to get the values from Unicode::UCD rather than hard-code | |
76 | # them here, as that will protect from having to make fixes for future | |
77 | # changes. | |
78 | our @CompEx = qw( | |
79 | 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36 | |
80 | 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 | |
81 | 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D | |
82 | FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B | |
83 | FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C | |
84 | FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB | |
85 | 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0 | |
86 | ); | |
87 | ||
88 | # definition of Hangul constants | |
89 | use constant SBase => 0xAC00; | |
90 | use constant SFinal => 0xD7A3; # SBase -1 + SCount | |
91 | use constant SCount => 11172; # LCount * NCount | |
92 | use constant NCount => 588; # VCount * TCount | |
93 | use constant LBase => 0x1100; | |
94 | use constant LFinal => 0x1112; | |
95 | use constant LCount => 19; | |
96 | use constant VBase => 0x1161; | |
97 | use constant VFinal => 0x1175; | |
98 | use constant VCount => 21; | |
99 | use constant TBase => 0x11A7; | |
100 | use constant TFinal => 0x11C2; | |
101 | use constant TCount => 28; | |
102 | ||
103 | sub decomposeHangul { | |
104 | my $sindex = $_[0] - SBase; | |
105 | my $lindex = int( $sindex / NCount); | |
106 | my $vindex = int(($sindex % NCount) / TCount); | |
107 | my $tindex = $sindex % TCount; | |
108 | my @ret = ( | |
109 | LBase + $lindex, | |
110 | VBase + $vindex, | |
111 | $tindex ? (TBase + $tindex) : (), | |
112 | ); | |
113 | return wantarray ? @ret : pack_U(@ret); | |
114 | } | |
115 | ||
116 | ########## getting full decomposition ########## | |
117 | ||
118 | ## converts string "hhhh hhhh hhhh" to a numeric list | |
119 | ## (hex digits separated by spaces) | |
120 | sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g } | |
121 | ||
122 | while ($Combin =~ /(.+)/g) { | |
123 | my @tab = split /\t/, $1; | |
124 | my $ini = hex $tab[0]; | |
125 | if ($tab[1] eq '') { | |
126 | $Combin{$ini} = $tab[2]; | |
127 | } else { | |
128 | $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); | |
129 | } | |
130 | } | |
131 | ||
132 | while ($Decomp =~ /(.+)/g) { | |
133 | my @tab = split /\t/, $1; | |
134 | my $compat = $tab[2] =~ s/<[^>]+>//; | |
135 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition | |
136 | my $ini = hex($tab[0]); # initial decomposable character | |
137 | my $end = $tab[1] eq '' ? $ini : hex($tab[1]); | |
138 | # ($ini .. $end) is the range of decomposable characters. | |
139 | ||
140 | foreach my $u ($ini .. $end) { | |
141 | $Compat{$u} = $dec; | |
142 | $Canon{$u} = $dec if ! $compat; | |
143 | } | |
144 | } | |
145 | ||
146 | for my $s (@CompEx) { | |
147 | my $u = hex $s; | |
148 | next if !$Canon{$u}; # not assigned | |
149 | next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2 | |
150 | $Exclus{$u} = 1; | |
151 | } | |
152 | ||
153 | foreach my $u (keys %Canon) { | |
154 | my $dec = $Canon{$u}; | |
155 | ||
156 | if (@$dec == 2) { | |
157 | if ($Combin{ $dec->[0] }) { | |
158 | $NonStD{$u} = 1; | |
159 | } else { | |
160 | $Compos{ $dec->[0] }{ $dec->[1] } = $u; | |
161 | $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; | |
162 | } | |
163 | } elsif (@$dec == 1) { | |
164 | $Single{$u} = 1; | |
165 | } else { | |
166 | my $h = sprintf '%04X', $u; | |
167 | croak("Weird Canonical Decomposition of U+$h"); | |
168 | } | |
169 | } | |
170 | ||
171 | # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo | |
172 | foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { | |
173 | $Comp2nd{$j} = 1; | |
174 | } | |
175 | ||
176 | sub getCanonList { | |
177 | my @src = @_; | |
178 | my @dec = map { | |
179 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) | |
180 | : $Canon{$_} ? @{ $Canon{$_} } : $_ | |
181 | } @src; | |
182 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); | |
183 | # condition @src == @dec is not ok. | |
184 | } | |
185 | ||
186 | sub getCompatList { | |
187 | my @src = @_; | |
188 | my @dec = map { | |
189 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) | |
190 | : $Compat{$_} ? @{ $Compat{$_} } : $_ | |
191 | } @src; | |
192 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); | |
193 | # condition @src == @dec is not ok. | |
194 | } | |
195 | ||
196 | # exhaustive decomposition | |
197 | foreach my $key (keys %Canon) { | |
198 | $Canon{$key} = [ getCanonList($key) ]; | |
199 | } | |
200 | ||
201 | # exhaustive decomposition | |
202 | foreach my $key (keys %Compat) { | |
203 | $Compat{$key} = [ getCompatList($key) ]; | |
204 | } | |
205 | ||
206 | ##### The above part is common to mkheader and PP ##### | |
207 | ||
208 | foreach my $comp1st (keys %Compos) { | |
209 | my $listname = sprintf("${structname}_%06x", $comp1st); | |
210 | # %04x is bad since it'd place _3046 after _1d157. | |
211 | $Comp1st{$comp1st} = $listname; | |
212 | my $rh1st = $Compos{$comp1st}; | |
213 | ||
214 | foreach my $comp2nd (keys %$rh1st) { | |
215 | my $uc = $rh1st->{$comp2nd}; | |
216 | $CompList{$listname}{$comp2nd} = $uc; | |
217 | } | |
218 | } | |
219 | ||
220 | sub split_into_char { | |
221 | use bytes; | |
222 | my $uni = shift; | |
223 | my $len = length($uni); | |
224 | my @ary; | |
225 | for(my $i = 0; $i < $len; ++$i) { | |
226 | push @ary, ord(substr($uni,$i,1)); | |
227 | } | |
228 | return @ary; | |
229 | } | |
230 | ||
231 | sub _U_stringify { | |
232 | sprintf '"%s"', join '', | |
233 | map sprintf("\\x%02x", $_), split_into_char(pack_U(@_)); | |
234 | } | |
235 | ||
236 | foreach my $hash (\%Canon, \%Compat) { | |
237 | foreach my $key (keys %$hash) { | |
238 | $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); | |
239 | } | |
240 | } | |
241 | ||
242 | ########## writing header files ########## | |
243 | ||
244 | my @boolfunc = ( | |
245 | { | |
246 | name => "Exclusion", | |
247 | type => "bool", | |
248 | hash => \%Exclus, | |
249 | }, | |
250 | { | |
251 | name => "Singleton", | |
252 | type => "bool", | |
253 | hash => \%Single, | |
254 | }, | |
255 | { | |
256 | name => "NonStDecomp", | |
257 | type => "bool", | |
258 | hash => \%NonStD, | |
259 | }, | |
260 | { | |
261 | name => "Comp2nd", | |
262 | type => "bool", | |
263 | hash => \%Comp2nd, | |
264 | }, | |
265 | ); | |
266 | ||
267 | my $orig_fh = SelectSaver->new; | |
268 | { | |
269 | ||
270 | my $file = "unfexc.h"; | |
271 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; | |
272 | binmode FH; select FH; | |
273 | ||
274 | print << 'EOF'; | |
275 | /* | |
276 | * This file is auto-generated by mkheader. | |
277 | * Any changes here will be lost! | |
278 | */ | |
279 | EOF | |
280 | ||
281 | foreach my $tbl (@boolfunc) { | |
282 | my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; | |
283 | my $type = $tbl->{type}; | |
284 | my $name = $tbl->{name}; | |
285 | print "$type is$name (UV uv)\n{\nreturn\n\t"; | |
286 | ||
287 | while (@temp) { | |
288 | my $cur = shift @temp; | |
289 | if (@temp && $cur + 1 == $temp[0]) { | |
290 | print "($cur <= uv && uv <= "; | |
291 | while (@temp && $cur + 1 == $temp[0]) { | |
292 | $cur = shift @temp; | |
293 | } | |
294 | print "$cur)"; | |
295 | print "\n\t|| " if @temp; | |
296 | } else { | |
297 | print "uv == $cur"; | |
298 | print "\n\t|| " if @temp; | |
299 | } | |
300 | } | |
301 | print "\n\t? TRUE : FALSE;\n}\n\n"; | |
302 | } | |
303 | ||
304 | close FH; | |
305 | ||
306 | #################################### | |
307 | ||
308 | my $compinit = | |
309 | "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; | |
310 | ||
311 | foreach my $i (sort keys %CompList) { | |
312 | $compinit .= "$structname $i [] = {\n"; | |
313 | $compinit .= join ",\n", | |
314 | map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), | |
315 | sort {$a <=> $b } keys %{ $CompList{$i} }; | |
316 | $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel | |
317 | } | |
318 | ||
319 | my @tripletable = ( | |
320 | { | |
321 | file => "unfcmb", | |
322 | name => "combin", | |
323 | type => "STDCHAR", | |
324 | hash => \%Combin, | |
325 | null => 0, | |
326 | }, | |
327 | { | |
328 | file => "unfcan", | |
329 | name => "canon", | |
330 | type => "char*", | |
331 | hash => \%Canon, | |
332 | null => "NULL", | |
333 | }, | |
334 | { | |
335 | file => "unfcpt", | |
336 | name => "compat", | |
337 | type => "char*", | |
338 | hash => \%Compat, | |
339 | null => "NULL", | |
340 | }, | |
341 | { | |
342 | file => "unfcmp", | |
343 | name => "compos", | |
344 | type => "$structname *", | |
345 | hash => \%Comp1st, | |
346 | null => "NULL", | |
347 | init => $compinit, | |
348 | }, | |
349 | ); | |
350 | ||
351 | foreach my $tbl (@tripletable) { | |
352 | my $file = "$tbl->{file}.h"; | |
353 | my $head = "${prefix}$tbl->{name}"; | |
354 | my $type = $tbl->{type}; | |
355 | my $hash = $tbl->{hash}; | |
356 | my $null = $tbl->{null}; | |
357 | my $init = $tbl->{init}; | |
358 | ||
359 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; | |
360 | binmode FH; select FH; | |
361 | my %val; | |
362 | ||
363 | print FH << 'EOF'; | |
364 | /* | |
365 | * This file is auto-generated by mkheader. | |
366 | * Any changes here will be lost! | |
367 | */ | |
368 | EOF | |
369 | ||
370 | print $init if defined $init; | |
371 | ||
372 | foreach my $uv (keys %$hash) { | |
373 | croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) | |
374 | unless $uv <= 0x10FFFF; | |
375 | my @c = unpack 'CCCC', pack 'N', $uv; | |
376 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; | |
377 | } | |
378 | ||
379 | foreach my $p (sort { $a <=> $b } keys %val) { | |
380 | next if ! $val{ $p }; | |
381 | for (my $r = 0; $r < 256; $r++) { | |
382 | next if ! $val{ $p }{ $r }; | |
383 | printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; | |
384 | for (my $c = 0; $c < 256; $c++) { | |
385 | print "\t", defined $val{$p}{$r}{$c} | |
386 | ? "($type)".$val{$p}{$r}{$c} | |
387 | : $null; | |
388 | print ',' if $c != 255; | |
389 | print "\n" if $c % 8 == 7; | |
390 | } | |
391 | print "};\n\n"; | |
392 | } | |
393 | } | |
394 | foreach my $p (sort { $a <=> $b } keys %val) { | |
395 | next if ! $val{ $p }; | |
396 | printf "static $type* ${head}_%02x [256] = {\n", $p; | |
397 | for (my $r = 0; $r < 256; $r++) { | |
398 | print $val{ $p }{ $r } | |
399 | ? sprintf("${head}_%02x_%02x", $p, $r) | |
400 | : "NULL"; | |
401 | print ',' if $r != 255; | |
402 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; | |
403 | } | |
404 | print "};\n\n"; | |
405 | } | |
406 | print "static $type** $head [] = {\n"; | |
407 | for (my $p = 0; $p <= 0x10; $p++) { | |
408 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; | |
409 | print ',' if $p != 0x10; | |
410 | print "\n"; | |
411 | } | |
412 | print "};\n\n"; | |
413 | close FH; | |
414 | } | |
415 | ||
416 | } # End of block for SelectSaver | |
417 | ||
418 | 1; | |
419 | __END__ |