This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Tweak the equivalence tables once again.
[perl5.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3
LW
1#!../../miniperl
2
11695a73
JH
3$UnicodeData = "UnicodeData-Latest.txt";
4
a0ed51b3
LW
5# Note: we try to keep filenames unique within first 8 chars. Using
6# subdirectories for the following helps.
7mkdir "In", 0777;
8mkdir "Is", 0777;
9mkdir "To", 0777;
11695a73 10mkdir "Eq", 0777;
a0ed51b3
LW
11
12@todo = (
13# typical
14
b8c5462f
JH
15 ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
16 ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
17 ['IsAlpha', '$cat =~ /^L[ulo]/', ''],
18 ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
19 ['IsDigit', '$cat =~ /^Nd$/', ''],
20 ['IsUpper', '$cat =~ /^Lu$/', ''],
21 ['IsLower', '$cat =~ /^Ll$/', ''],
22 ['IsASCII', 'hex $code <= 127', ''],
23 ['IsCntrl', '$cat =~ /^C/', ''],
24 ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
25 ['IsPrint', '$cat =~ /^[^C]/', ''],
26 ['IsPunct', '$cat =~ /^P/', ''],
27 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
28 ['ToUpper', '$up', '$up'],
29 ['ToLower', '$down', '$down'],
30 ['ToTitle', '$title', '$title'],
31 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3
LW
32
33# Name
34
35 ['Name', '$name', '$name'],
36
37# Category
38
39 ['Category', '$cat', '$cat'],
40
41# Normative
42
43 ['IsM', '$cat =~ /^M/', ''], # Mark
44 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
45 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
46
47 ['IsN', '$cat =~ /^N/', ''], # Number
48 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
49 ['IsNo', '$cat eq "No"', ''], # Number, Other
50
51 ['IsZ', '$cat =~ /^Z/', ''], # Zeparator
52 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
53 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
54 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
55
56 ['IsC', '$cat =~ /^C/', ''], # Crazy
57 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
58 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
59 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
60
61# Informative
62
63 ['IsL', '$cat =~ /^L/', ''], # Letter
64 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
65 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
66 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
67 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
68 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
69
70 ['IsP', '$cat =~ /^P/', ''], # Punctuation
71 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
72 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
73 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
74 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
75
76 ['IsS', '$cat =~ /^S/', ''], # Symbol
77 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
78 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
79 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
80
81# Combining class
82 ['CombiningClass', '$comb', '$comb'],
83
84# BIDIRECTIONAL PROPERTIES
85
86 ['Bidirectional', '$bid', '$bid'],
87
88# Strong types:
89
90 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
91 # syllabic, and logographic
92 # characters (e.g., CJK
93 # ideographs)
94 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
95 # and punctuation specific to
96 # those scripts
97
98# Weak types:
99
100 ['IsBidiEN','$bid eq "EN"', ''], # European Number
101 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
102 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
103 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
104 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
105
106# Separators:
107
108 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
109 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
110
111# Neutrals:
112
113 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
114 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
115 # characters: punctuation,
116 # symbols
117
118# Decomposition
119
120 ['Decomposition', '$decomp', '$decomp'],
121 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
122 ['IsDecoCompat', '$decomp =~ /^</', ''],
123 ['IsDCfont', '$decomp =~ /^<font>/', ''],
124 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
125 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
126 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
127 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
128 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
129 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
130 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
131 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
132 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
133 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
134 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
135 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
136 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
137 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
138
139# Number
140
141 ['Number', '$num', '$num'],
142
143# Mirrored
144
145 ['IsMirrored', '$mir eq "Y"', ''],
146
147# Arabic
148
149 ['ArabLink', '1', '$link'],
150 ['ArabLnkGrp', '1', '$linkgroup'],
151
152# Jamo
153
154 ['JamoShort', '1', '$short'],
499bfa7a
JH
155
156# Syllables
157
158 ['IsSylV', '$syl eq "V"', ''],
159 ['IsSylU', '$syl eq "U"', ''],
160 ['IsSylI', '$syl eq "I"', ''],
161 ['IsSylA', '$syl eq "A"', ''],
162 ['IsSylE', '$syl eq "E"', ''],
163 ['IsSylC', '$syl eq "C"', ''],
164 ['IsSylO', '$syl eq "O"', ''],
165 ['IsSylWV', '$syl eq "V"', ''],
166 ['IsSylWI', '$syl eq "I"', ''],
167 ['IsSylWA', '$syl eq "A"', ''],
168 ['IsSylWE', '$syl eq "E"', ''],
169 ['IsSylWC', '$syl eq "C"', ''],
a0ed51b3
LW
170);
171
172# This is not written for speed...
173
174foreach $file (@todo) {
175 my ($table, $wanted, $val) = @$file;
176 next if @ARGV and not grep { $_ eq $table } @ARGV;
177 print $table,"\n";
178 if ($table =~ /^(Is|In|To)(.*)/) {
179 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
180 }
181 else {
182 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
183 }
184 print OUT <<"END";
185return <<'END';
186END
187 print OUT proplist($table, $wanted, $val);
188 print OUT "END\n";
189 close OUT;
190}
191
192# Must treat blocks specially.
193
194exit if @ARGV and not grep { $_ eq Block } @ARGV;
195print "Block\n";
d357d9fe 196open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
a0ed51b3
LW
197open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
198print OUT <<"END";
199return <<'END';
200END
201
202while (<UD>) {
203 next if /^#/;
204 next if /^$/;
205 chomp;
206 ($code, $last, $name) = split(/; */);
207 if ($name) {
208 print OUT "$code $last $name\n";
209 $name =~ s/\s+//g;
210 open(BLOCK, ">In/$name.pl");
211 print BLOCK <<"END2";
212return <<'END';
213$code $last
214END
215END2
216 close BLOCK;
217 }
218}
219
220print OUT "END\n";
221close OUT;
222
223##################################################
224
225sub proplist {
226 my ($table, $wanted, $val) = @_;
227 my @wanted;
228 my $out;
229 my $split;
230
231 if ($table =~ /^Arab/) {
d357d9fe 232 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
233
234 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
235 }
236 elsif ($table =~ /^Jamo/) {
d357d9fe 237 open(UD, "Jamo-2.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
238
239 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
240 }
499bfa7a
JH
241 elsif ($table =~ /^IsSyl/) {
242 open(UD, "syllables.txt") or warn "Can't open $table: $!";
243
244 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
245 }
a0ed51b3 246 else {
11695a73 247 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3
LW
248
249 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
250 $comment, $up, $down, $title) = split(/;/);';
251 }
252
253 if ($table =~ /^(?:To|Is)[A-Z]/) {
254 eval <<"END";
255 while (<UD>) {
256 next if /^#/;
257 next if /^\s/;
258 chop;
259 $split
260 if ($wanted) {
261 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
262 }
263 }
264END
265 die $@ if $@;
266
267 while (@wanted) {
268 $beg = shift @wanted;
269 $last = $beg;
270 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
271 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
272 $last = shift @wanted;
273 }
274 $out .= sprintf "%04x", $beg->[0];
275 if ($beg->[2]) {
276 $last = shift @wanted;
277 }
278 if ($beg == $last) {
279 $out .= "\t";
280 }
281 else {
282 $out .= sprintf "\t%04x", $last->[0];
283 }
284 $out .= sprintf "\t%04x", $beg->[1] if $val;
285 $out .= "\n";
286 }
287 }
288 else {
289 eval <<"END";
290 while (<UD>) {
291 next if /^#/;
292 next if /^\s*\$/;
293 chop;
294 $split
295 if ($wanted) {
296 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
297 }
298 }
299END
300 die $@ if $@;
301
302 while (@wanted) {
303 $beg = shift @wanted;
304 $last = $beg;
305 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
306 ($wanted[0]->[1] eq $last->[1])) {
307 $last = shift @wanted;
308 }
309 $out .= sprintf "%04x", $beg->[0];
310 if ($beg->[2]) {
311 $last = shift @wanted;
312 }
313 if ($beg == $last) {
314 $out .= "\t";
315 }
316 else {
317 $out .= sprintf "\t%04x", $last->[0];
318 }
319 $out .= sprintf "\t%s\n", $beg->[1];
320 }
321 }
322 $out;
323}
11695a73 324
11eeea96
JH
325# Create the equivalence mappings.
326
13a0e1a7
JH
327open(UNICODEDATA, $UnicodeData) || die "$0: $UnicodeData: $!\n";
328
329while (<UNICODEDATA>) {
6dd159d1 330 ($code, $name, $category, $decomposition) = (split /;/)[0,1,2,5];
13a0e1a7
JH
331
332 $code{$name} = $code;
333 $name{$code} = $name;
39f2df93
JH
334 $category{$code} = $category;
335
336 next unless $category =~ /^L/;
337
338 # The definition of "equivalence" is twofold.
339 if ($decomposition ne '') {
340 # (1) If there's an official Unicode decomposition
341 # and the base is a Unicode letter.
342 $decomposition =~ s/^<\w+> //;
343 @decomposition = split(' ', $decomposition);
344 # Some Arabic ligatures like
345 # FC5E;ARABIC LIGATURE SHADDA WITH DAMMATAN ISOLATED FORM;Lo;...
346 # are problematic because their decomposition begins with
347 # a space (0020) -- which could be just skipped -- but then
348 # their base glyph is not a letter, for example
349 # the above decomposes as <isolated> 0020 064C 0651,
350 # but 064C is 064C;ARABIC DAMMATAN;Mn;...
351 # (the 0651 being ARABIC SHADDA;Mn)
352 ($basecode) = shift @decomposition;
353 push @base, [ $code, $basecode ];
354 } elsif ($name =~ /^(.+?) WITH /) {
355 # (2) If there's a "FOO WITH ..." Unicode name and FOO
356 # happens to be valid Unicode letter. This is
357 # a debatable definition and all fault is by me (jhi).
358 # For example this definition adds
359 # LATIN SMALL LETTER O WITH STROKE
360 # as a derivative of
361 # LATIN SMALL LETTER O
362 # which some might rightfully contest, especially
363 # the speakers of languages who have the former
364 # phonetically as very distinct from the latter.
365 push @with, [ $code, $1 ];
366 }
367}
13a0e1a7 368
39f2df93
JH
369foreach my $w (@with) {
370 ($code, $basename) = @$w;
371 next if not exists $code{$basename} or
372 not $category{$code{$basename}} =~ /^L/;
373 push @base, [ $code, $code{$basename} ];
13a0e1a7
JH
374}
375
39f2df93
JH
376@base = sort { $a->[0] cmp $b->[0] } @base;
377
13a0e1a7 378foreach my $b (@base) {
6dd159d1 379 ($code, $basecode) = @$b;
39f2df93
JH
380 $basename = $name{$basecode};
381 next if not defined $basename or
382 not exists $code{$basename} or
383 not $category{$code{$basename}} =~ /^L/;
384 push @{$unicode{$code{$basename}}}, $code;
385# print "$code: $name{$code} -> $basename\n",
13a0e1a7
JH
386}
387
388@unicode = sort keys %unicode;
389
883d4c97 390print "EqUnicode\n";
13a0e1a7
JH
391if (open(EQ_UNICODE, ">Eq/Unicode")) {
392 foreach my $c (@unicode) {
393 print EQ_UNICODE "$c @{$unicode{$c}}\n";
394 }
395 close EQ_UNICODE;
396} else {
397 die "$0: failed to open Eq/Unicode for writing: $!\n";
398}
399
883d4c97 400print "EqLatin1\n";
13a0e1a7
JH
401if (open(EQ_LATIN1, ">Eq/Latin1")) {
402 foreach my $c (@unicode) {
403 last if hex($c) > 255;
39f2df93 404 my @c = grep { hex($_) < 256 } @{$unicode{$c}};
13a0e1a7
JH
405 next unless @c;
406 print EQ_LATIN1 "$c @c\n";
407 }
408 close EQ_LATIN1;
409} else {
410 die "$0: failed to open Eq/Latin1 for writing: $!\n";
411}
6dd159d1
JH
412
413# eof