Commit | Line | Data |
---|---|---|
ac5ea531 JH |
1 | #!perl |
2 | # | |
628bbff0 RGS |
3 | # This auxiliary script makes five header files |
4 | # used for building XSUB of Unicode::Normalize. | |
ac5ea531 | 5 | # |
628bbff0 RGS |
6 | # Usage: |
7 | # <do 'mkheader'> in perl, or <perl mkheader> in command line | |
8 | # | |
9 | # Input files: | |
10 | # unicore/CombiningClass.pl (or unicode/CombiningClass.pl) | |
11 | # unicore/Decomposition.pl (or unicode/Decomposition.pl) | |
12 | # unicore/CompositionExclusions.txt (or unicode/CompExcl.txt) | |
13 | # | |
14 | # Output files: | |
15 | # unfcan.h | |
16 | # unfcpt.h | |
17 | # unfcmb.h | |
18 | # unfcmp.h | |
19 | # unfexc.h | |
f027f502 | 20 | # |
ac5ea531 JH |
21 | use 5.006; |
22 | use strict; | |
23 | use warnings; | |
24 | use Carp; | |
6c941e0c JH |
25 | use File::Spec; |
26 | ||
9f1f04a1 | 27 | BEGIN { |
1efaba7f | 28 | unless ("A" eq pack('U', 0x41)) { |
9f1f04a1 RGS |
29 | die "Unicode::Normalize cannot stringify a Unicode code point\n"; |
30 | } | |
31 | } | |
ac5ea531 JH |
32 | |
33 | our $PACKAGE = 'Unicode::Normalize, mkheader'; | |
34 | ||
35 | our $Combin = do "unicore/CombiningClass.pl" | |
8f118dcd JH |
36 | || do "unicode/CombiningClass.pl" |
37 | || croak "$PACKAGE: CombiningClass.pl not found"; | |
ac5ea531 JH |
38 | |
39 | our $Decomp = do "unicore/Decomposition.pl" | |
8f118dcd JH |
40 | || do "unicode/Decomposition.pl" |
41 | || croak "$PACKAGE: Decomposition.pl not found"; | |
ac5ea531 | 42 | |
48287974 | 43 | our %Combin; # $codepoint => $number : combination class |
6c941e0c JH |
44 | our %Canon; # $codepoint => \@codepoints : canonical decomp. |
45 | our %Compat; # $codepoint => \@codepoints : compat. decomp. | |
46 | # after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat | |
48287974 TS |
47 | our %Exclus; # $codepoint => 1 : composition exclusions |
48 | our %Single; # $codepoint => 1 : singletons | |
49 | our %NonStD; # $codepoint => 1 : non-starter decompositions | |
50 | ||
51 | our %Comp1st; # $codepoint => $listname : may be composed with a next char. | |
52 | our %Comp2nd; # $codepoint => 1 : may be composed with a prev char. | |
53 | our %CompList; # $listname,$2nd => $codepoint : composite | |
54 | ||
55 | our $prefix = "UNF_"; | |
56 | our $structname = "${prefix}complist"; | |
ac5ea531 | 57 | |
6c941e0c JH |
58 | ########## definition of Hangul constants ########## |
59 | use constant SBase => 0xAC00; | |
60 | use constant SFinal => 0xD7A3; # SBase -1 + SCount | |
61 | use constant SCount => 11172; # LCount * NCount | |
62 | use constant NCount => 588; # VCount * TCount | |
63 | use constant LBase => 0x1100; | |
64 | use constant LFinal => 0x1112; | |
65 | use constant LCount => 19; | |
66 | use constant VBase => 0x1161; | |
67 | use constant VFinal => 0x1175; | |
68 | use constant VCount => 21; | |
69 | use constant TBase => 0x11A7; | |
70 | use constant TFinal => 0x11C2; | |
71 | use constant TCount => 28; | |
72 | ||
73 | sub decomposeHangul { | |
74 | my $SIndex = $_[0] - SBase; | |
75 | my $LIndex = int( $SIndex / NCount); | |
76 | my $VIndex = int(($SIndex % NCount) / TCount); | |
77 | my $TIndex = $SIndex % TCount; | |
78 | my @ret = ( | |
79 | LBase + $LIndex, | |
80 | VBase + $VIndex, | |
81 | $TIndex ? (TBase + $TIndex) : (), | |
82 | ); | |
2b8d773d | 83 | return @ret; |
6c941e0c JH |
84 | } |
85 | ||
51683ce6 TS |
86 | ########## length of a character ########## |
87 | ||
88 | sub utf8len { | |
89 | my $uv = shift; | |
90 | return $uv < 0x80 ? 1 : | |
91 | $uv < 0x800 ? 2 : | |
92 | $uv < 0x10000 ? 3 : | |
93 | $uv < 0x110000 ? 4 : | |
94 | croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; | |
95 | } | |
96 | ||
97 | sub utfelen { | |
98 | my $uv = shift; | |
99 | return $uv < 0xA0 ? 1 : | |
100 | $uv < 0x400 ? 2 : | |
101 | $uv < 0x4000 ? 3 : | |
102 | $uv < 0x40000 ? 4 : | |
103 | $uv < 0x110000 ? 5 : | |
104 | croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; | |
105 | } | |
106 | ||
107 | my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " . | |
108 | "needs growing the string in %s! Quit. Please inform the author..."; | |
109 | ||
6c941e0c | 110 | ########## getting full decomposion ########## |
ac5ea531 | 111 | { |
8f118dcd JH |
112 | my($f, $fh); |
113 | foreach my $d (@INC) { | |
8f118dcd JH |
114 | $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt"); |
115 | last if open($fh, $f); | |
116 | $f = File::Spec->catfile($d, "unicode", "CompExcl.txt"); | |
117 | last if open($fh, $f); | |
118 | $f = undef; | |
119 | } | |
48287974 TS |
120 | croak "$PACKAGE: neither unicore/CompositionExclusions.txt " |
121 | . "nor unicode/CompExcl.txt is found in @INC" unless defined $f; | |
122 | ||
123 | while (<$fh>) { | |
124 | next if /^#/ or /^$/; | |
125 | s/#.*//; | |
126 | $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/; | |
127 | } | |
8f118dcd | 128 | close $fh; |
ac5ea531 JH |
129 | } |
130 | ||
48287974 TS |
131 | ## |
132 | ## converts string "hhhh hhhh hhhh" to a numeric list | |
133 | ## | |
134 | sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g } | |
135 | ||
8f118dcd JH |
136 | while ($Combin =~ /(.+)/g) { |
137 | my @tab = split /\t/, $1; | |
138 | my $ini = hex $tab[0]; | |
139 | if ($tab[1] eq '') { | |
51683ce6 | 140 | $Combin{$ini} = $tab[2]; |
8f118dcd | 141 | } else { |
51683ce6 | 142 | $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); |
8f118dcd | 143 | } |
ac5ea531 JH |
144 | } |
145 | ||
8f118dcd JH |
146 | while ($Decomp =~ /(.+)/g) { |
147 | my @tab = split /\t/, $1; | |
148 | my $compat = $tab[2] =~ s/<[^>]+>//; | |
149 | my $dec = [ _getHexArray($tab[2]) ]; # decomposition | |
51683ce6 TS |
150 | my $ini = hex($tab[0]); |
151 | my $end = $tab[1] eq '' ? $ini : hex($tab[1]); | |
152 | # ($ini .. $end) is the range of decomposable characters. | |
48287974 TS |
153 | |
154 | my $listname = | |
155 | @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS'; | |
156 | # %04x is bad since it'd place _3046 after _1d157. | |
157 | ||
51683ce6 TS |
158 | foreach my $u ($ini .. $end) { |
159 | $Compat{$u} = $dec; | |
8f118dcd JH |
160 | |
161 | if (! $compat) { | |
51683ce6 | 162 | $Canon{$u} = $dec; |
8f118dcd | 163 | |
f027f502 | 164 | if (@$dec == 2) { |
51683ce6 TS |
165 | if (utf8len($dec->[0]) + utf8len($dec->[1]) < utf8len($u)) { |
166 | croak sprintf $errExpand, $u, $dec->[0], $dec->[1], | |
167 | "utf-8"; | |
168 | } | |
169 | if (utfelen($dec->[0]) + utfelen($dec->[1]) < utfelen($u)) { | |
170 | croak sprintf $errExpand, $u, $dec->[0], $dec->[1], | |
171 | "utf-ebcdic"; | |
172 | } | |
173 | ||
8f118dcd | 174 | if ($Combin{ $dec->[0] }) { |
51683ce6 | 175 | $NonStD{$u} = 1; |
8f118dcd | 176 | } else { |
51683ce6 | 177 | $CompList{ $listname }{ $dec->[1] } = $u; |
48287974 | 178 | $Comp1st{ $dec->[0] } = $listname; |
51683ce6 | 179 | $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; |
8f118dcd | 180 | } |
f027f502 | 181 | } elsif (@$dec == 1) { |
51683ce6 | 182 | $Single{$u} = 1; |
f027f502 JH |
183 | } else { |
184 | croak("Weird Canonical Decomposition of U+$tab[0]"); | |
8f118dcd | 185 | } |
8f118dcd | 186 | } |
ac5ea531 | 187 | } |
ac5ea531 JH |
188 | } |
189 | ||
48287974 TS |
190 | # modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo |
191 | foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) { | |
192 | $Comp2nd{$j} = 1; | |
ac5ea531 JH |
193 | } |
194 | ||
195 | sub getCanonList { | |
8f118dcd | 196 | my @src = @_; |
6c941e0c JH |
197 | my @dec = map { |
198 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) | |
199 | : $Canon{$_} ? @{ $Canon{$_} } : $_ | |
200 | } @src; | |
8f118dcd JH |
201 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec); |
202 | # condition @src == @dec is not ok. | |
ac5ea531 JH |
203 | } |
204 | ||
205 | sub getCompatList { | |
8f118dcd | 206 | my @src = @_; |
6c941e0c JH |
207 | my @dec = map { |
208 | (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_) | |
209 | : $Compat{$_} ? @{ $Compat{$_} } : $_ | |
210 | } @src; | |
8f118dcd JH |
211 | return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec); |
212 | # condition @src == @dec is not ok. | |
ac5ea531 JH |
213 | } |
214 | ||
48287974 TS |
215 | # exhaustive decomposition |
216 | foreach my $key (keys %Canon) { | |
217 | $Canon{$key} = [ getCanonList($key) ]; | |
218 | } | |
219 | ||
220 | # exhaustive decomposition | |
628bbff0 | 221 | foreach my $key (keys %Compat) { |
48287974 TS |
222 | $Compat{$key} = [ getCompatList($key) ]; |
223 | } | |
ac5ea531 | 224 | |
9f1f04a1 | 225 | sub _pack_U { |
1efaba7f | 226 | return pack('U*', @_); |
9f1f04a1 RGS |
227 | } |
228 | ||
2b8d773d RGS |
229 | sub split_into_char { |
230 | use bytes; | |
231 | my $uni = shift; | |
232 | my $len = length($uni); | |
233 | my @ary; | |
234 | for(my $i = 0; $i < $len; ++$i) { | |
235 | push @ary, ord(substr($uni,$i,1)); | |
236 | } | |
237 | return @ary; | |
238 | } | |
239 | ||
ac5ea531 | 240 | sub _U_stringify { |
8f118dcd | 241 | sprintf '"%s"', join '', |
2b8d773d | 242 | map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_)); |
ac5ea531 JH |
243 | } |
244 | ||
245 | foreach my $hash (\%Canon, \%Compat) { | |
8f118dcd JH |
246 | foreach my $key (keys %$hash) { |
247 | $hash->{$key} = _U_stringify( @{ $hash->{$key} } ); | |
248 | } | |
ac5ea531 JH |
249 | } |
250 | ||
6c941e0c | 251 | ########## writing header files ########## |
ac5ea531 | 252 | |
8f118dcd JH |
253 | my @boolfunc = ( |
254 | { | |
255 | name => "Exclusion", | |
256 | type => "bool", | |
257 | hash => \%Exclus, | |
258 | }, | |
259 | { | |
260 | name => "Singleton", | |
261 | type => "bool", | |
262 | hash => \%Single, | |
263 | }, | |
264 | { | |
265 | name => "NonStDecomp", | |
266 | type => "bool", | |
267 | hash => \%NonStD, | |
268 | }, | |
269 | { | |
270 | name => "Comp2nd", | |
271 | type => "bool", | |
272 | hash => \%Comp2nd, | |
273 | }, | |
274 | ); | |
ac5ea531 JH |
275 | |
276 | my $file = "unfexc.h"; | |
277 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; | |
278 | binmode FH; select FH; | |
279 | ||
8f118dcd JH |
280 | print << 'EOF'; |
281 | /* | |
282 | * This file is auto-generated by mkheader. | |
283 | * Any changes here will be lost! | |
284 | */ | |
285 | EOF | |
ac5ea531 | 286 | |
8f118dcd JH |
287 | foreach my $tbl (@boolfunc) { |
288 | my @temp = sort {$a <=> $b} keys %{$tbl->{hash}}; | |
289 | my $type = $tbl->{type}; | |
290 | my $name = $tbl->{name}; | |
291 | print "$type is$name (UV uv)\n{\nreturn\n\t"; | |
292 | ||
293 | while (@temp) { | |
294 | my $cur = shift @temp; | |
295 | if (@temp && $cur + 1 == $temp[0]) { | |
296 | print "($cur <= uv && uv <= "; | |
297 | while (@temp && $cur + 1 == $temp[0]) { | |
298 | $cur = shift @temp; | |
299 | } | |
300 | print "$cur)"; | |
301 | print "\n\t|| " if @temp; | |
302 | } else { | |
303 | print "uv == $cur"; | |
304 | print "\n\t|| " if @temp; | |
305 | } | |
ac5ea531 | 306 | } |
8f118dcd | 307 | print "\n\t? TRUE : FALSE;\n}\n\n"; |
ac5ea531 JH |
308 | } |
309 | ||
ac5ea531 JH |
310 | close FH; |
311 | ||
312 | #################################### | |
313 | ||
48287974 TS |
314 | my $compinit = |
315 | "typedef struct { UV nextchar; UV composite; } $structname;\n\n"; | |
316 | ||
317 | foreach my $i (sort keys %CompList) { | |
318 | $compinit .= "$structname $i [] = {\n"; | |
319 | $compinit .= join ",\n", | |
320 | map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}), | |
321 | sort {$a <=> $b } keys %{ $CompList{$i} }; | |
322 | $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel | |
323 | } | |
324 | ||
ac5ea531 | 325 | my @tripletable = ( |
8f118dcd JH |
326 | { |
327 | file => "unfcmb", | |
328 | name => "combin", | |
329 | type => "STDCHAR", | |
330 | hash => \%Combin, | |
331 | null => 0, | |
332 | }, | |
333 | { | |
334 | file => "unfcan", | |
335 | name => "canon", | |
336 | type => "char*", | |
337 | hash => \%Canon, | |
338 | null => "NULL", | |
339 | }, | |
340 | { | |
341 | file => "unfcpt", | |
342 | name => "compat", | |
343 | type => "char*", | |
344 | hash => \%Compat, | |
345 | null => "NULL", | |
346 | }, | |
347 | { | |
348 | file => "unfcmp", | |
349 | name => "compos", | |
350 | type => "$structname *", | |
351 | hash => \%Comp1st, | |
352 | null => "NULL", | |
353 | init => $compinit, | |
354 | }, | |
ac5ea531 JH |
355 | ); |
356 | ||
357 | foreach my $tbl (@tripletable) { | |
8f118dcd JH |
358 | my $file = "$tbl->{file}.h"; |
359 | my $head = "${prefix}$tbl->{name}"; | |
360 | my $type = $tbl->{type}; | |
361 | my $hash = $tbl->{hash}; | |
362 | my $null = $tbl->{null}; | |
363 | my $init = $tbl->{init}; | |
364 | ||
365 | open FH, ">$file" or croak "$PACKAGE: $file can't be made"; | |
366 | binmode FH; select FH; | |
367 | my %val; | |
368 | ||
369 | print FH << 'EOF'; | |
ac5ea531 JH |
370 | /* |
371 | * This file is auto-generated by mkheader. | |
372 | * Any changes here will be lost! | |
373 | */ | |
374 | EOF | |
375 | ||
8f118dcd JH |
376 | print $init if defined $init; |
377 | ||
378 | foreach my $uv (keys %$hash) { | |
f027f502 JH |
379 | croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) |
380 | unless $uv <= 0x10FFFF; | |
8f118dcd JH |
381 | my @c = unpack 'CCCC', pack 'N', $uv; |
382 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; | |
383 | } | |
384 | ||
385 | foreach my $p (sort { $a <=> $b } keys %val) { | |
386 | next if ! $val{ $p }; | |
387 | for (my $r = 0; $r < 256; $r++) { | |
388 | next if ! $val{ $p }{ $r }; | |
fe067ad9 | 389 | printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r; |
8f118dcd JH |
390 | for (my $c = 0; $c < 256; $c++) { |
391 | print "\t", defined $val{$p}{$r}{$c} | |
392 | ? "($type)".$val{$p}{$r}{$c} | |
393 | : $null; | |
394 | print ',' if $c != 255; | |
395 | print "\n" if $c % 8 == 7; | |
396 | } | |
397 | print "};\n\n"; | |
398 | } | |
399 | } | |
400 | foreach my $p (sort { $a <=> $b } keys %val) { | |
401 | next if ! $val{ $p }; | |
fe067ad9 | 402 | printf "static $type* ${head}_%02x [256] = {\n", $p; |
8f118dcd JH |
403 | for (my $r = 0; $r < 256; $r++) { |
404 | print $val{ $p }{ $r } | |
405 | ? sprintf("${head}_%02x_%02x", $p, $r) | |
406 | : "NULL"; | |
407 | print ',' if $r != 255; | |
408 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; | |
409 | } | |
410 | print "};\n\n"; | |
ac5ea531 | 411 | } |
fe067ad9 | 412 | print "static $type** $head [] = {\n"; |
8f118dcd JH |
413 | for (my $p = 0; $p <= 0x10; $p++) { |
414 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; | |
415 | print ',' if $p != 0x10; | |
416 | print "\n"; | |
ac5ea531 JH |
417 | } |
418 | print "};\n\n"; | |
8f118dcd | 419 | close FH; |
ac5ea531 JH |
420 | } |
421 | ||
628bbff0 | 422 | 1; |
ac5ea531 | 423 | __END__ |