This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Module::CoreLiast for 5.25.7
[perl5.git] / dist / Unicode-Normalize / mkheader
CommitLineData
c6b7cc21
SH
1#!perl
2#
3# This auxiliary script makes five header files
4# used for building XSUB of Unicode::Normalize.
5#
6# Usage:
7# <do 'mkheader'> in perl, or <perl mkheader> in command line
8#
9# Input files:
10# unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
11# unicore/Decomposition.pl (or unicode/Decomposition.pl)
12#
13# Output files:
14# unfcan.h
15# unfcpt.h
16# unfcmb.h
17# unfcmp.h
18# unfexc.h
19#
20use 5.006;
21use strict;
22use warnings;
23use Carp;
24use File::Spec;
25use SelectSaver;
26
27BEGIN {
28 unless ('A' eq pack('U', 0x41)) {
29 die "Unicode::Normalize cannot stringify a Unicode code point\n";
30 }
31 unless (0x41 == unpack('U', 'A')) {
32 die "Unicode::Normalize cannot get Unicode code point\n";
33 }
34}
35
36our $PACKAGE = 'Unicode::Normalize, mkheader';
37
38our $prefix = "UNF_";
39our $structname = "${prefix}complist";
40
41# Starting in v5.20, the tables in lib/unicore are built using the platform's
42# native character set for code points 0-255.
43*pack_U = ($] ge 5.020)
44 ? sub { return pack('W*', @_).pack('U*'); } # The empty pack returns
45 # an empty UTF-8 string,
46 # so the effect is to
47 # force the return into
48 # being UTF-8.
49 : sub { return pack('U*', @_); };
50
51# %Canon and %Compat will be ($codepoint => $hexstring) after _U_stringify()
52our %Comp1st; # $codepoint => $listname : may be composed with a next char.
53our %CompList; # $listname,$2nd => $codepoint : composite
54
55##### The below part is common to mkheader and PP #####
56
57our %Combin; # $codepoint => $number : combination class
58our %Canon; # $codepoint => \@codepoints : canonical decomp.
59our %Compat; # $codepoint => \@codepoints : compat. decomp.
60our %Compos; # $1st,$2nd => $codepoint : composite
61our %Exclus; # $codepoint => 1 : composition exclusions
62our %Single; # $codepoint => 1 : singletons
63our %NonStD; # $codepoint => 1 : non-starter decompositions
64our %Comp2nd; # $codepoint => 1 : may be composed with a prev char.
65
66# from core Unicode database
67our $Combin = do "unicore/CombiningClass.pl"
68 || do "unicode/CombiningClass.pl"
69 || croak "$PACKAGE: CombiningClass.pl not found";
70our $Decomp = do "unicore/Decomposition.pl"
71 || do "unicode/Decomposition.pl"
72 || croak "$PACKAGE: Decomposition.pl not found";
73
74# CompositionExclusions.txt since Unicode 3.2.0. If this ever changes, it
75# would be better to get the values from Unicode::UCD rather than hard-code
76# them here, as that will protect from having to make fixes for future
77# changes.
78our @CompEx = qw(
79 0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
80 0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
81 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
82 FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
83 FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
84 FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
85 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
86);
87
88# definition of Hangul constants
89use constant SBase => 0xAC00;
90use constant SFinal => 0xD7A3; # SBase -1 + SCount
91use constant SCount => 11172; # LCount * NCount
92use constant NCount => 588; # VCount * TCount
93use constant LBase => 0x1100;
94use constant LFinal => 0x1112;
95use constant LCount => 19;
96use constant VBase => 0x1161;
97use constant VFinal => 0x1175;
98use constant VCount => 21;
99use constant TBase => 0x11A7;
100use constant TFinal => 0x11C2;
101use constant TCount => 28;
102
103sub decomposeHangul {
104 my $sindex = $_[0] - SBase;
105 my $lindex = int( $sindex / NCount);
106 my $vindex = int(($sindex % NCount) / TCount);
107 my $tindex = $sindex % TCount;
108 my @ret = (
109 LBase + $lindex,
110 VBase + $vindex,
111 $tindex ? (TBase + $tindex) : (),
112 );
113 return wantarray ? @ret : pack_U(@ret);
114}
115
116########## getting full decomposition ##########
117
118## converts string "hhhh hhhh hhhh" to a numeric list
119## (hex digits separated by spaces)
120sub _getHexArray { map hex, $_[0] =~ /\G *([0-9A-Fa-f]+)/g }
121
122while ($Combin =~ /(.+)/g) {
123 my @tab = split /\t/, $1;
124 my $ini = hex $tab[0];
125 if ($tab[1] eq '') {
126 $Combin{$ini} = $tab[2];
127 } else {
128 $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
129 }
130}
131
132while ($Decomp =~ /(.+)/g) {
133 my @tab = split /\t/, $1;
134 my $compat = $tab[2] =~ s/<[^>]+>//;
135 my $dec = [ _getHexArray($tab[2]) ]; # decomposition
136 my $ini = hex($tab[0]); # initial decomposable character
137 my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
138 # ($ini .. $end) is the range of decomposable characters.
139
140 foreach my $u ($ini .. $end) {
141 $Compat{$u} = $dec;
142 $Canon{$u} = $dec if ! $compat;
143 }
144}
145
146for my $s (@CompEx) {
147 my $u = hex $s;
148 next if !$Canon{$u}; # not assigned
149 next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
150 $Exclus{$u} = 1;
151}
152
153foreach my $u (keys %Canon) {
154 my $dec = $Canon{$u};
155
156 if (@$dec == 2) {
157 if ($Combin{ $dec->[0] }) {
158 $NonStD{$u} = 1;
159 } else {
160 $Compos{ $dec->[0] }{ $dec->[1] } = $u;
161 $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
162 }
163 } elsif (@$dec == 1) {
164 $Single{$u} = 1;
165 } else {
166 my $h = sprintf '%04X', $u;
167 croak("Weird Canonical Decomposition of U+$h");
168 }
169}
170
171# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
172foreach my $j (0x1161..0x1175, 0x11A8..0x11C2) {
173 $Comp2nd{$j} = 1;
174}
175
176sub getCanonList {
177 my @src = @_;
178 my @dec = map {
179 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
180 : $Canon{$_} ? @{ $Canon{$_} } : $_
181 } @src;
182 return join(" ",@src) eq join(" ",@dec) ? @dec : getCanonList(@dec);
183 # condition @src == @dec is not ok.
184}
185
186sub getCompatList {
187 my @src = @_;
188 my @dec = map {
189 (SBase <= $_ && $_ <= SFinal) ? decomposeHangul($_)
190 : $Compat{$_} ? @{ $Compat{$_} } : $_
191 } @src;
192 return join(" ",@src) eq join(" ",@dec) ? @dec : getCompatList(@dec);
193 # condition @src == @dec is not ok.
194}
195
196# exhaustive decomposition
197foreach my $key (keys %Canon) {
198 $Canon{$key} = [ getCanonList($key) ];
199}
200
201# exhaustive decomposition
202foreach my $key (keys %Compat) {
203 $Compat{$key} = [ getCompatList($key) ];
204}
205
206##### The above part is common to mkheader and PP #####
207
208foreach my $comp1st (keys %Compos) {
209 my $listname = sprintf("${structname}_%06x", $comp1st);
210 # %04x is bad since it'd place _3046 after _1d157.
211 $Comp1st{$comp1st} = $listname;
212 my $rh1st = $Compos{$comp1st};
213
214 foreach my $comp2nd (keys %$rh1st) {
215 my $uc = $rh1st->{$comp2nd};
216 $CompList{$listname}{$comp2nd} = $uc;
217 }
218}
219
220sub split_into_char {
221 use bytes;
222 my $uni = shift;
223 my $len = length($uni);
224 my @ary;
225 for(my $i = 0; $i < $len; ++$i) {
226 push @ary, ord(substr($uni,$i,1));
227 }
228 return @ary;
229}
230
231sub _U_stringify {
232 sprintf '"%s"', join '',
233 map sprintf("\\x%02x", $_), split_into_char(pack_U(@_));
234}
235
236foreach my $hash (\%Canon, \%Compat) {
237 foreach my $key (keys %$hash) {
238 $hash->{$key} = _U_stringify( @{ $hash->{$key} } );
239 }
240}
241
242########## writing header files ##########
243
244my @boolfunc = (
245 {
246 name => "Exclusion",
247 type => "bool",
248 hash => \%Exclus,
249 },
250 {
251 name => "Singleton",
252 type => "bool",
253 hash => \%Single,
254 },
255 {
256 name => "NonStDecomp",
257 type => "bool",
258 hash => \%NonStD,
259 },
260 {
261 name => "Comp2nd",
262 type => "bool",
263 hash => \%Comp2nd,
264 },
265);
266
267my $orig_fh = SelectSaver->new;
268{
269
270my $file = "unfexc.h";
271open FH, ">$file" or croak "$PACKAGE: $file can't be made";
272binmode FH; select FH;
273
274 print << 'EOF';
275/*
276 * This file is auto-generated by mkheader.
277 * Any changes here will be lost!
278 */
279EOF
280
281foreach my $tbl (@boolfunc) {
282 my @temp = sort {$a <=> $b} keys %{$tbl->{hash}};
283 my $type = $tbl->{type};
284 my $name = $tbl->{name};
285 print "$type is$name (UV uv)\n{\nreturn\n\t";
286
287 while (@temp) {
288 my $cur = shift @temp;
289 if (@temp && $cur + 1 == $temp[0]) {
290 print "($cur <= uv && uv <= ";
291 while (@temp && $cur + 1 == $temp[0]) {
292 $cur = shift @temp;
293 }
294 print "$cur)";
295 print "\n\t|| " if @temp;
296 } else {
297 print "uv == $cur";
298 print "\n\t|| " if @temp;
299 }
300 }
301 print "\n\t? TRUE : FALSE;\n}\n\n";
302}
303
304close FH;
305
306####################################
307
308my $compinit =
309 "typedef struct { UV nextchar; UV composite; } $structname;\n\n";
310
311foreach my $i (sort keys %CompList) {
312 $compinit .= "$structname $i [] = {\n";
313 $compinit .= join ",\n",
314 map sprintf("\t{ %d, %d }", $_, $CompList{$i}{$_}),
315 sort {$a <=> $b } keys %{ $CompList{$i} };
316 $compinit .= ",\n{0,0}\n};\n\n"; # with sentinel
317}
318
319my @tripletable = (
320 {
321 file => "unfcmb",
322 name => "combin",
323 type => "STDCHAR",
324 hash => \%Combin,
325 null => 0,
326 },
327 {
328 file => "unfcan",
329 name => "canon",
330 type => "char*",
331 hash => \%Canon,
332 null => "NULL",
333 },
334 {
335 file => "unfcpt",
336 name => "compat",
337 type => "char*",
338 hash => \%Compat,
339 null => "NULL",
340 },
341 {
342 file => "unfcmp",
343 name => "compos",
344 type => "$structname *",
345 hash => \%Comp1st,
346 null => "NULL",
347 init => $compinit,
348 },
349);
350
351foreach my $tbl (@tripletable) {
352 my $file = "$tbl->{file}.h";
353 my $head = "${prefix}$tbl->{name}";
354 my $type = $tbl->{type};
355 my $hash = $tbl->{hash};
356 my $null = $tbl->{null};
357 my $init = $tbl->{init};
358
359 open FH, ">$file" or croak "$PACKAGE: $file can't be made";
360 binmode FH; select FH;
361 my %val;
362
363 print FH << 'EOF';
364/*
365 * This file is auto-generated by mkheader.
366 * Any changes here will be lost!
367 */
368EOF
369
370 print $init if defined $init;
371
372 foreach my $uv (keys %$hash) {
373 croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv)
374 unless $uv <= 0x10FFFF;
375 my @c = unpack 'CCCC', pack 'N', $uv;
376 $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv};
377 }
378
379 foreach my $p (sort { $a <=> $b } keys %val) {
380 next if ! $val{ $p };
381 for (my $r = 0; $r < 256; $r++) {
382 next if ! $val{ $p }{ $r };
383 printf "static $type ${head}_%02x_%02x [256] = {\n", $p, $r;
384 for (my $c = 0; $c < 256; $c++) {
385 print "\t", defined $val{$p}{$r}{$c}
386 ? "($type)".$val{$p}{$r}{$c}
387 : $null;
388 print ',' if $c != 255;
389 print "\n" if $c % 8 == 7;
390 }
391 print "};\n\n";
392 }
393 }
394 foreach my $p (sort { $a <=> $b } keys %val) {
395 next if ! $val{ $p };
396 printf "static $type* ${head}_%02x [256] = {\n", $p;
397 for (my $r = 0; $r < 256; $r++) {
398 print $val{ $p }{ $r }
399 ? sprintf("${head}_%02x_%02x", $p, $r)
400 : "NULL";
401 print ',' if $r != 255;
402 print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0;
403 }
404 print "};\n\n";
405 }
406 print "static $type** $head [] = {\n";
407 for (my $p = 0; $p <= 0x10; $p++) {
408 print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL";
409 print ',' if $p != 0x10;
410 print "\n";
411 }
412 print "};\n\n";
413 close FH;
414}
415
416} # End of block for SelectSaver
417
4181;
419__END__