This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move SelfLoader from ext/ to dist/
[perl5.git] / ext / Unicode-Normalize / mkheader
CommitLineData
ac5ea531
JH
1#!perl
2#
628bbff0
RGS
3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
ac5ea531 5#
628bbff0
RGS
6# Usage:
7# <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11# unicore/Decomposition.pl (or unicode/Decomposition.pl)
12# unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
13#
14# Output files:
15# unfcan.h
16# unfcpt.h
17# unfcmb.h
18# unfcmp.h
19# unfexc.h
f027f502 20#
ac5ea531
JH
21use 5.006;
22use strict;
23use warnings;
24use Carp;
6c941e0c
JH
25use File::Spec;
26
9f1f04a1 27BEGIN {
1efaba7f 28 unless ("A" eq pack('U', 0x41)) {
9f1f04a1
RGS
29 die "Unicode::Normalize cannot stringify a Unicode code point\n";
30 }
31}
ac5ea531
JH
32
33our $PACKAGE = 'Unicode::Normalize, mkheader';
34
35our $Combin = do "unicore/CombiningClass.pl"
8f118dcd
JH
36 || do "unicode/CombiningClass.pl"
37 || croak "$PACKAGE: CombiningClass.pl not found";
ac5ea531
JH
38
39our $Decomp = do "unicore/Decomposition.pl"
8f118dcd
JH
40 || do "unicode/Decomposition.pl"
41 || croak "$PACKAGE: Decomposition.pl not found";
ac5ea531 42
48287974 43our %Combin; # $codepoint => $number : combination class
6c941e0c
JH
44our %Canon; # $codepoint => \@codepoints : canonical decomp.
45our %Compat; # $codepoint => \@codepoints : compat. decomp.
46# after _U_stringify(), ($codepoint => $hexstring) for %Canon and %Compat
48287974
TS
47our %Exclus; # $codepoint => 1 : composition exclusions
48our %Single; # $codepoint => 1 : singletons
49our %NonStD; # $codepoint => 1 : non-starter decompositions
50
51our %Comp1st; # $codepoint => $listname : may be composed with a next char.
52our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
53our %CompList; # $listname,$2nd => $codepoint : composite
54
55our $prefix = "UNF_";
56our $structname = "${prefix}complist";
ac5ea531 57
6c941e0c
JH
58########## definition of Hangul constants ##########
59use constant SBase => 0xAC00;
60use constant SFinal => 0xD7A3; # SBase -1 + SCount
61use constant SCount => 11172; # LCount * NCount
62use constant NCount => 588; # VCount * TCount
63use constant LBase => 0x1100;
64use constant LFinal => 0x1112;
65use constant LCount => 19;
66use constant VBase => 0x1161;
67use constant VFinal => 0x1175;
68use constant VCount => 21;
69use constant TBase => 0x11A7;
70use constant TFinal => 0x11C2;
71use constant TCount => 28;
72
73sub decomposeHangul {
74 my $SIndex = $_[0] - SBase;
75 my $LIndex = int( $SIndex / NCount);
76 my $VIndex = int(($SIndex % NCount) / TCount);
77 my $TIndex = $SIndex % TCount;
78 my @ret = (
79 LBase + $LIndex,
80 VBase + $VIndex,
81 $TIndex ? (TBase + $TIndex) : (),
82 );
2b8d773d 83 return @ret;
6c941e0c
JH
84}
85
51683ce6
TS
86########## length of a character ##########
87
88sub utf8len {
89 my $uv = shift;
90 return $uv < 0x80 ? 1 :
91 $uv < 0x800 ? 2 :
92 $uv < 0x10000 ? 3 :
93 $uv < 0x110000 ? 4 :
94 croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
95}
96
97sub utfelen {
98 my $uv = shift;
99 return $uv < 0xA0 ? 1 :
100 $uv < 0x400 ? 2 :
101 $uv < 0x4000 ? 3 :
102 $uv < 0x40000 ? 4 :
103 $uv < 0x110000 ? 5 :
104 croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
105}
106
107my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " .
108 "needs growing the string in %s! Quit. Please inform the author...";
109
6c941e0c 110########## getting full decomposion ##########
ac5ea531 111{
8f118dcd
JH
112 my($f, $fh);
113 foreach my $d (@INC) {
8f118dcd
JH
114 $f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
115 last if open($fh, $f);
116 $f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
117 last if open($fh, $f);
118 $f = undef;
119 }
48287974
TS
120 croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
121 . "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
122
123 while (<$fh>) {
124 next if /^#/ or /^$/;
125 s/#.*//;
126 $Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
127 }
8f118dcd 128 close $fh;
ac5ea531
JH
129}
130
48287974
TS
131##
132## converts string "hhhh hhhh hhhh" to a numeric list
133##
134sub _getHexArray { map hex, $_[0] =~ /([0-9A-Fa-f]+)/g }
135
8f118dcd
JH
136while ($Combin =~ /(.+)/g) {
137 my @tab = split /\t/, $1;
138 my $ini = hex $tab[0];
139 if ($tab[1] eq '') {
51683ce6 140 $Combin{$ini} = $tab[2];
8f118dcd 141 } else {
51683ce6 142 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
8f118dcd 143 }
ac5ea531
JH
144}
145
8f118dcd
JH
146while ($Decomp =~ /(.+)/g) {
147 my @tab = split /\t/, $1;
148 my $compat = $tab[2] =~ s/<[^>]+>//;
149 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
51683ce6
TS
150 my $ini = hex($tab[0]);
151 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
152 # ($ini .. $end) is the range of decomposable characters.
48287974
TS
153
154 my $listname =
155 @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
156 # %04x is bad since it'd place _3046 after _1d157.
157
51683ce6
TS
158 foreach my $u ($ini .. $end) {
159 $Compat{$u} = $dec;
8f118dcd
JH
160
161 if (! $compat) {
51683ce6 162 $Canon{$u} = $dec;
8f118dcd 163
f027f502 164 if (@$dec == 2) {
51683ce6
TS
165 if (utf8len($dec->[0]) + utf8len($dec->[1]) < utf8len($u)) {
166 croak sprintf $errExpand, $u, $dec->[0], $dec->[1],
167 "utf-8";
168 }
169 if (utfelen($dec->[0]) + utfelen($dec->[1]) < utfelen($u)) {
170 croak sprintf $errExpand, $u, $dec->[0], $dec->[1],
171 "utf-ebcdic";
172 }
173
8f118dcd 174 if ($Combin{ $dec->[0] }) {
51683ce6 175 $NonStD{$u} = 1;
8f118dcd 176 } else {
51683ce6 177 $CompList{ $listname }{ $dec->[1] } = $u;
48287974 178 $Comp1st{ $dec->[0] } = $listname;
51683ce6 179 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
8f118dcd 180 }
f027f502 181 } elsif (@$dec == 1) {
51683ce6 182 $Single{$u} = 1;
f027f502
JH
183 } else {
184 croak("Weird Canonical Decomposition of U+$tab[0]");
8f118dcd 185 }
8f118dcd 186 }
ac5ea531 187 }
ac5ea531
JH
188}
189
48287974
TS
190# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
191foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
192 $Comp2nd{$j} = 1;
ac5ea531
JH
193}
194
195sub getCanonList {
8f118dcd 196 my @src = @_;
6c941e0c
JH
197 my @dec = map {
198 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
199 : $Canon{$_} ? @{ $Canon{$_} } : $_
200 } @src;
8f118dcd
JH
201 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
202 # condition @src == @dec is not ok.
ac5ea531
JH
203}
204
205sub getCompatList {
8f118dcd 206 my @src = @_;
6c941e0c
JH
207 my @dec = map {
208 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
209 : $Compat{$_} ? @{ $Compat{$_} } : $_
210 } @src;
8f118dcd
JH
211 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
212 # condition @src == @dec is not ok.
ac5ea531
JH
213}
214
48287974
TS
215# exhaustive decomposition
216foreach my $key (keys %Canon) {
217 $Canon{$key} = [ getCanonList($key) ];
218}
219
220# exhaustive decomposition
628bbff0 221foreach my $key (keys %Compat) {
48287974
TS
222 $Compat{$key} = [ getCompatList($key) ];
223}
ac5ea531 224
9f1f04a1 225sub _pack_U {
1efaba7f 226 return pack('U*', @_);
9f1f04a1
RGS
227}
228
2b8d773d
RGS
229sub split_into_char {
230 use bytes;
231 my $uni = shift;
232 my $len = length($uni);
233 my @ary;
234 for(my $i = 0; $i < $len; ++$i) {
235 push @ary, ord(substr($uni,$i,1));
236 }
237 return @ary;
238}
239
ac5ea531 240sub _U_stringify {
8f118dcd 241 sprintf '"%s"', join '',
2b8d773d 242 map sprintf("\\x%02x", $_), split_into_char(_pack_U(@_));
ac5ea531
JH
243}
244
245foreach my $hash (\%Canon, \%Compat) {
8f118dcd
JH
246 foreach my $key (keys %$hash) {
247 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
248 }
ac5ea531
JH
249}
250
6c941e0c 251########## writing header files ##########
ac5ea531 252
8f118dcd
JH
253my @boolfunc = (
254 {
255 name => "Exclusion",
256 type => "bool",
257 hash => \%Exclus,
258 },
259 {
260 name => "Singleton",
261 type => "bool",
262 hash => \%Single,
263 },
264 {
265 name => "NonStDecomp",
266 type => "bool",
267 hash => \%NonStD,
268 },
269 {
270 name => "Comp2nd",
271 type => "bool",
272 hash => \%Comp2nd,
273 },
274);
ac5ea531
JH
275
276my $file = "unfexc.h";
277open FH, ">$file" or croak "$PACKAGE: $file can't be made";
278binmode FH; select FH;
279
8f118dcd
JH
280 print << 'EOF';
281/*
282 * This file is auto-generated by mkheader.
283 * Any changes here will be lost!
284 */
285EOF
ac5ea531 286
8f118dcd
JH
287foreach my $tbl (@boolfunc) {
288 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
289 my $type = $tbl->{type};
290 my $name = $tbl->{name};
291 print "$type is$name (UV uv)\n{\nreturn\n\t";
292
293 while (@temp) {
294 my $cur = shift @temp;
295 if (@temp && $cur + 1 == $temp[0]) {
296 print "($cur <= uv && uv <= ";
297 while (@temp && $cur + 1 == $temp[0]) {
298 $cur = shift @temp;
299 }
300 print "$cur)";
301 print "\n\t|| " if @temp;
302 } else {
303 print "uv == $cur";
304 print "\n\t|| " if @temp;
305 }
ac5ea531 306 }
8f118dcd 307 print "\n\t? TRUE : FALSE;\n}\n\n";
ac5ea531
JH
308}
309
ac5ea531
JH
310close FH;
311
312####################################
313
48287974
TS
314my $compinit =
315 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
316
317foreach my $i (sort keys %CompList) {
318 $compinit .= "$structname $i [] = {\n";
319 $compinit .= join ",\n",
320 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
321 sort {$a <=> $b } keys %{ $CompList{$i} };
322 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
323}
324
ac5ea531 325my @tripletable = (
8f118dcd
JH
326 {
327 file => "unfcmb",
328 name => "combin",
329 type => "STDCHAR",
330 hash => \%Combin,
331 null => 0,
332 },
333 {
334 file => "unfcan",
335 name => "canon",
336 type => "char*",
337 hash => \%Canon,
338 null => "NULL",
339 },
340 {
341 file => "unfcpt",
342 name => "compat",
343 type => "char*",
344 hash => \%Compat,
345 null => "NULL",
346 },
347 {
348 file => "unfcmp",
349 name => "compos",
350 type => "$structname *",
351 hash => \%Comp1st,
352 null => "NULL",
353 init => $compinit,
354 },
ac5ea531
JH
355);
356
357foreach my $tbl (@tripletable) {
8f118dcd
JH
358 my $file = "$tbl->{file}.h";
359 my $head = "${prefix}$tbl->{name}";
360 my $type = $tbl->{type};
361 my $hash = $tbl->{hash};
362 my $null = $tbl->{null};
363 my $init = $tbl->{init};
364
365 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
366 binmode FH; select FH;
367 my %val;
368
369 print FH << 'EOF';
ac5ea531
JH
370/*
371 * This file is auto-generated by mkheader.
372 * Any changes here will be lost!
373 */
374EOF
375
8f118dcd
JH
376 print $init if defined $init;
377
378 foreach my $uv (keys %$hash) {
f027f502
JH
379 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
380 unless $uv <= 0x10FFFF;
8f118dcd
JH
381 my @c = unpack 'CCCC', pack 'N', $uv;
382 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
383 }
384
385 foreach my $p (sort { $a <=> $b } keys %val) {
386 next if ! $val{ $p };
387 for (my $r = 0; $r < 256; $r++) {
388 next if ! $val{ $p }{ $r };
fe067ad9 389 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
8f118dcd
JH
390 for (my $c = 0; $c < 256; $c++) {
391 print "\t", defined $val{$p}{$r}{$c}
392 ? "($type)".$val{$p}{$r}{$c}
393 : $null;
394 print ',' if $c != 255;
395 print "\n" if $c % 8 == 7;
396 }
397 print "};\n\n";
398 }
399 }
400 foreach my $p (sort { $a <=> $b } keys %val) {
401 next if ! $val{ $p };
fe067ad9 402 printf "static $type* ${head}_%02x [256] = {\n", $p;
8f118dcd
JH
403 for (my $r = 0; $r < 256; $r++) {
404 print $val{ $p }{ $r }
405 ? sprintf("${head}_%02x_%02x", $p, $r)
406 : "NULL";
407 print ',' if $r != 255;
408 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
409 }
410 print "};\n\n";
ac5ea531 411 }
fe067ad9 412 print "static $type** $head [] = {\n";
8f118dcd
JH
413 for (my $p = 0; $p <= 0x10; $p++) {
414 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
415 print ',' if $p != 0x10;
416 print "\n";
ac5ea531
JH
417 }
418 print "};\n\n";
8f118dcd 419 close FH;
ac5ea531
JH
420}
421
628bbff0 4221;
ac5ea531 423__END__