This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Time::Piece from 1.21 to 1.22
[perl5.git] / cpan / Unicode-Normalize / mkheader
CommitLineData
ac5ea531
JH
1#!perl
2#
628bbff0
RGS
3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
ac5ea531 5#
628bbff0
RGS
6# Usage:
7# <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11# unicore/Decomposition.pl (or unicode/Decomposition.pl)
628bbff0
RGS
12#
13# Output files:
14# unfcan.h
15# unfcpt.h
16# unfcmb.h
17# unfcmp.h
18# unfexc.h
f027f502 19#
ac5ea531
JH
20use 5.006;
21use strict;
22use warnings;
23use Carp;
6c941e0c
JH
24use File::Spec;
25
9f1f04a1 26BEGIN {
1efaba7f 27 unless ("A" eq pack('U', 0x41)) {
9f1f04a1
RGS
28 die "Unicode::Normalize cannot stringify a Unicode code point\n";
29 }
30}
ac5ea531
JH
31
32our $PACKAGE = 'Unicode::Normalize, mkheader';
33
a96160d6
CBW
34our $prefix = "UNF_";
35our $structname = "${prefix}complist";
ac5ea531 36
a96160d6
CBW
37sub pack_U {
38 return pack('U*', @_);
39}
40
41# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
42our %Comp1st; # $codepoint => $listname : may be composed with a next char.
43our %CompList; # $listname,$2nd => $codepoint : composite
44
45##### The below part is common to mkheader and PP #####
ac5ea531 46
48287974 47our %Combin; # $codepoint => $number : combination class
6c941e0c
JH
48our %Canon; # $codepoint => \@codepoints : canonical decomp.
49our %Compat; # $codepoint => \@codepoints : compat. decomp.
a96160d6 50our %Compos; # $1st,$2nd => $codepoint : composite
48287974
TS
51our %Exclus; # $codepoint => 1 : composition exclusions
52our %Single; # $codepoint => 1 : singletons
53our %NonStD; # $codepoint => 1 : non-starter decompositions
48287974 54our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
48287974 55
e0a65de5 56# from core Unicode database
a96160d6
CBW
57our $Combin = do "unicore/CombiningClass.pl"
58 || do "unicode/CombiningClass.pl"
59 || croak "$PACKAGE: CombiningClass.pl not found";
60our $Decomp = do "unicore/Decomposition.pl"
61 || do "unicode/Decomposition.pl"
62 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 63
e0a65de5
CBW
64# CompositionExclusions.txt since Unicode 3.2.0
65our @CompEx = qw(
66 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
67 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
68 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
69 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
70 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
71 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
72 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
73);
74
a96160d6 75# definition of Hangul constants
6c941e0c
JH
76use constant SBase => 0xAC00;
77use constant SFinal => 0xD7A3; # SBase -1 + SCount
78use constant SCount => 11172; # LCount * NCount
79use constant NCount => 588; # VCount * TCount
80use constant LBase => 0x1100;
81use constant LFinal => 0x1112;
82use constant LCount => 19;
83use constant VBase => 0x1161;
84use constant VFinal => 0x1175;
85use constant VCount => 21;
86use constant TBase => 0x11A7;
87use constant TFinal => 0x11C2;
88use constant TCount => 28;
89
90sub decomposeHangul {
a96160d6
CBW
91 my $sindex = $_[0] - SBase;
92 my $lindex = int( $sindex / NCount);
93 my $vindex = int(($sindex % NCount) / TCount);
94 my $tindex = $sindex % TCount;
6c941e0c 95 my @ret = (
a96160d6
CBW
96 LBase + $lindex,
97 VBase + $vindex,
98 $tindex ? (TBase + $tindex) : (),
6c941e0c 99 );
a96160d6 100 return wantarray ? @ret : pack_U(@ret);
51683ce6
TS
101}
102
788ba0f4 103########## getting full decomposition ##########
ac5ea531 104
48287974 105## converts string "hhhh hhhh hhhh" to a numeric list
a96160d6
CBW
106## (hex digits separated by spaces)
107sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
48287974 108
8f118dcd
JH
109while ($Combin =~ /(.+)/g) {
110 my @tab = split /\t/, $1;
111 my $ini = hex $tab[0];
112 if ($tab[1] eq '') {
51683ce6 113 $Combin{$ini} = $tab[2];
8f118dcd 114 } else {
51683ce6 115 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
8f118dcd 116 }
ac5ea531
JH
117}
118
8f118dcd
JH
119while ($Decomp =~ /(.+)/g) {
120 my @tab = split /\t/, $1;
121 my $compat = $tab[2] =~ s/<[^>]+>//;
122 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
a96160d6 123 my $ini = hex($tab[0]); # initial decomposable character
51683ce6
TS
124 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
125 # ($ini .. $end) is the range of decomposable characters.
48287974 126
51683ce6
TS
127 foreach my $u ($ini .. $end) {
128 $Compat{$u} = $dec;
e0a65de5
CBW
129 $Canon{$u} = $dec if ! $compat;
130 }
131}
132
133for my $s (@CompEx) {
134 my $u = hex $s;
135 next if !$Canon{$u}; # not assigned
136 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
137 $Exclus{$u} = 1;
138}
8f118dcd 139
e0a65de5
CBW
140foreach my $u (keys %Canon) {
141 my $dec = $Canon{$u};
142
143 if (@$dec == 2) {
144 if ($Combin{ $dec->[0] }) {
145 $NonStD{$u} = 1;
146 } else {
147 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
148 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 149 }
e0a65de5
CBW
150 } elsif (@$dec == 1) {
151 $Single{$u} = 1;
152 } else {
153 my $h = sprintf '%04X', $u;
154 croak("Weird Canonical Decomposition of U+$h");
ac5ea531 155 }
ac5ea531
JH
156}
157
48287974
TS
158# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
159foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
160 $Comp2nd{$j} = 1;
ac5ea531
JH
161}
162
163sub getCanonList {
8f118dcd 164 my @src = @_;
6c941e0c
JH
165 my @dec = map {
166 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
167 : $Canon{$_} ? @{ $Canon{$_} } : $_
168 } @src;
8f118dcd
JH
169 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
170 # condition @src == @dec is not ok.
ac5ea531
JH
171}
172
173sub getCompatList {
8f118dcd 174 my @src = @_;
6c941e0c
JH
175 my @dec = map {
176 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
177 : $Compat{$_} ? @{ $Compat{$_} } : $_
178 } @src;
8f118dcd
JH
179 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
180 # condition @src == @dec is not ok.
ac5ea531
JH
181}
182
48287974
TS
183# exhaustive decomposition
184foreach my $key (keys %Canon) {
185 $Canon{$key} = [ getCanonList($key) ];
186}
187
188# exhaustive decomposition
628bbff0 189foreach my $key (keys %Compat) {
48287974
TS
190 $Compat{$key} = [ getCompatList($key) ];
191}
ac5ea531 192
a96160d6
CBW
193##### The above part is common to mkheader and PP #####
194
a96160d6
CBW
195foreach my $comp1st (keys %Compos) {
196 my $listname = sprintf("${structname}_%06x", $comp1st);
197 # %04x is bad since it'd place _3046 after _1d157.
198 $Comp1st{$comp1st} = $listname;
199 my $rh1st = $Compos{$comp1st};
200
201 foreach my $comp2nd (keys %$rh1st) {
202 my $uc = $rh1st->{$comp2nd};
203 $CompList{$listname}{$comp2nd} = $uc;
a96160d6 204 }
9f1f04a1
RGS
205}
206
2b8d773d
RGS
207sub split_into_char {
208 use bytes;
209 my $uni = shift;
210 my $len = length($uni);
211 my @ary;
212 for(my $i = 0; $i < $len; ++$i) {
213 push @ary, ord(substr($uni,$i,1));
214 }
215 return @ary;
216}
217
ac5ea531 218sub _U_stringify {
8f118dcd 219 sprintf '"%s"', join '',
a96160d6 220 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
ac5ea531
JH
221}
222
223foreach my $hash (\%Canon, \%Compat) {
8f118dcd
JH
224 foreach my $key (keys %$hash) {
225 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
226 }
ac5ea531
JH
227}
228
6c941e0c 229########## writing header files ##########
ac5ea531 230
8f118dcd
JH
231my @boolfunc = (
232 {
233 name => "Exclusion",
234 type => "bool",
235 hash => \%Exclus,
236 },
237 {
238 name => "Singleton",
239 type => "bool",
240 hash => \%Single,
241 },
242 {
243 name => "NonStDecomp",
244 type => "bool",
245 hash => \%NonStD,
246 },
247 {
248 name => "Comp2nd",
249 type => "bool",
250 hash => \%Comp2nd,
251 },
252);
ac5ea531
JH
253
254my $file = "unfexc.h";
255open FH, ">$file" or croak "$PACKAGE: $file can't be made";
256binmode FH; select FH;
257
8f118dcd
JH
258 print << 'EOF';
259/*
260 * This file is auto-generated by mkheader.
261 * Any changes here will be lost!
262 */
263EOF
ac5ea531 264
8f118dcd
JH
265foreach my $tbl (@boolfunc) {
266 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
267 my $type = $tbl->{type};
268 my $name = $tbl->{name};
269 print "$type is$name (UV uv)\n{\nreturn\n\t";
270
271 while (@temp) {
272 my $cur = shift @temp;
273 if (@temp && $cur + 1 == $temp[0]) {
274 print "($cur <= uv && uv <= ";
275 while (@temp && $cur + 1 == $temp[0]) {
276 $cur = shift @temp;
277 }
278 print "$cur)";
279 print "\n\t|| " if @temp;
280 } else {
281 print "uv == $cur";
282 print "\n\t|| " if @temp;
283 }
ac5ea531 284 }
8f118dcd 285 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531
JH
286}
287
ac5ea531
JH
288close FH;
289
290####################################
291
48287974
TS
292my $compinit =
293 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
294
295foreach my $i (sort keys %CompList) {
296 $compinit .= "$structname $i [] = {\n";
297 $compinit .= join ",\n",
298 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
299 sort {$a <=> $b } keys %{ $CompList{$i} };
300 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
301}
302
ac5ea531 303my @tripletable = (
8f118dcd
JH
304 {
305 file => "unfcmb",
306 name => "combin",
307 type => "STDCHAR",
308 hash => \%Combin,
309 null => 0,
310 },
311 {
312 file => "unfcan",
313 name => "canon",
314 type => "char*",
315 hash => \%Canon,
316 null => "NULL",
317 },
318 {
319 file => "unfcpt",
320 name => "compat",
321 type => "char*",
322 hash => \%Compat,
323 null => "NULL",
324 },
325 {
326 file => "unfcmp",
327 name => "compos",
328 type => "$structname *",
329 hash => \%Comp1st,
330 null => "NULL",
331 init => $compinit,
332 },
ac5ea531
JH
333);
334
335foreach my $tbl (@tripletable) {
8f118dcd
JH
336 my $file = "$tbl->{file}.h";
337 my $head = "${prefix}$tbl->{name}";
338 my $type = $tbl->{type};
339 my $hash = $tbl->{hash};
340 my $null = $tbl->{null};
341 my $init = $tbl->{init};
342
343 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
344 binmode FH; select FH;
345 my %val;
346
347 print FH << 'EOF';
ac5ea531
JH
348/*
349 * This file is auto-generated by mkheader.
350 * Any changes here will be lost!
351 */
352EOF
353
8f118dcd
JH
354 print $init if defined $init;
355
356 foreach my $uv (keys %$hash) {
f027f502
JH
357 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
358 unless $uv <= 0x10FFFF;
8f118dcd
JH
359 my @c = unpack 'CCCC', pack 'N', $uv;
360 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
361 }
362
363 foreach my $p (sort { $a <=> $b } keys %val) {
364 next if ! $val{ $p };
365 for (my $r = 0; $r < 256; $r++) {
366 next if ! $val{ $p }{ $r };
fe067ad9 367 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
8f118dcd
JH
368 for (my $c = 0; $c < 256; $c++) {
369 print "\t", defined $val{$p}{$r}{$c}
370 ? "($type)".$val{$p}{$r}{$c}
371 : $null;
372 print ',' if $c != 255;
373 print "\n" if $c % 8 == 7;
374 }
375 print "};\n\n";
376 }
377 }
378 foreach my $p (sort { $a <=> $b } keys %val) {
379 next if ! $val{ $p };
fe067ad9 380 printf "static $type* ${head}_%02x [256] = {\n", $p;
8f118dcd
JH
381 for (my $r = 0; $r < 256; $r++) {
382 print $val{ $p }{ $r }
383 ? sprintf("${head}_%02x_%02x", $p, $r)
384 : "NULL";
385 print ',' if $r != 255;
386 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
387 }
388 print "};\n\n";
ac5ea531 389 }
fe067ad9 390 print "static $type** $head [] = {\n";
8f118dcd
JH
391 for (my $p = 0; $p <= 0x10; $p++) {
392 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
393 print ',' if $p != 0x10;
394 print "\n";
ac5ea531
JH
395 }
396 print "};\n\n";
8f118dcd 397 close FH;
ac5ea531
JH
398}
399
628bbff0 4001;
ac5ea531 401__END__