This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
integrate changes#7069..7077,7079,7081..7087,7090,7092,7093,
[perl5.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3
LW
1#!../../miniperl
2
7c6f5cd2
GS
3use bytes;
4
1e8df282 5$UnicodeData = "Unicode.301";
7c6f5cd2 6$SyllableData = "syllables.txt";
1e8df282 7$PropData = "PropList.txt";
7c6f5cd2 8
11695a73 9
a0ed51b3
LW
10# Note: we try to keep filenames unique within first 8 chars. Using
11# subdirectories for the following helps.
d6f4130b
GS
12mkdir "In", 0755;
13mkdir "Is", 0755;
14mkdir "To", 0755;
a0ed51b3
LW
15
16@todo = (
17# typical
18
6ab16782
GS
19 ['IsWord', '$cat =~ /^L[ulot]|^Nd/ or $code eq "005F"', ''],
20 ['IsAlnum', '$cat =~ /^L[ulot]|^Nd/', ''],
21 ['IsAlpha', '$cat =~ /^L[ulot]/', ''],
7c6f5cd2 22 ['IsSpace', 'White space', $PropData],
b8c5462f 23 ['IsDigit', '$cat =~ /^Nd$/', ''],
7c6f5cd2 24 ['IsUpper', '$cat =~ /^L[ut]$/', ''],
b8c5462f
JH
25 ['IsLower', '$cat =~ /^Ll$/', ''],
26 ['IsASCII', 'hex $code <= 127', ''],
27 ['IsCntrl', '$cat =~ /^C/', ''],
7c6f5cd2 28 ['IsGraph', '$cat =~ /^[^C]/ and ($cat !~ /^Z/ and $code ne "0020" or chr(hex $code) !~ /^\s/)', ''],
b8c5462f 29 ['IsPrint', '$cat =~ /^[^C]/', ''],
7c6f5cd2 30 ['IsPunct', 'Punctuation', $PropData],
b8c5462f
JH
31 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
32 ['ToUpper', '$up', '$up'],
33 ['ToLower', '$down', '$down'],
34 ['ToTitle', '$title', '$title'],
35 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3
LW
36
37# Name
38
39 ['Name', '$name', '$name'],
40
41# Category
42
43 ['Category', '$cat', '$cat'],
44
45# Normative
46
47 ['IsM', '$cat =~ /^M/', ''], # Mark
48 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
49 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
7c6f5cd2 50 ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
a0ed51b3
LW
51
52 ['IsN', '$cat =~ /^N/', ''], # Number
53 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
54 ['IsNo', '$cat eq "No"', ''], # Number, Other
7c6f5cd2 55 ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
a0ed51b3 56
7c6f5cd2 57 ['IsZ', '$cat =~ /^Z/', ''], # Separator
a0ed51b3
LW
58 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
59 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
60 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
61
62 ['IsC', '$cat =~ /^C/', ''], # Crazy
63 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
64 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
65 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
7c6f5cd2
GS
66 ['IsCf', '$cat eq "Cf"', ''], # Other, Format
67 ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
68 ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
a0ed51b3
LW
69
70# Informative
71
72 ['IsL', '$cat =~ /^L/', ''], # Letter
73 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
74 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
75 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
76 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
77 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
78
79 ['IsP', '$cat =~ /^P/', ''], # Punctuation
80 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
81 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
82 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
83 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
7c6f5cd2
GS
84 ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
85 ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
86 ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
a0ed51b3
LW
87
88 ['IsS', '$cat =~ /^S/', ''], # Symbol
89 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
7c6f5cd2 90 ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
a0ed51b3
LW
91 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
92 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
93
94# Combining class
95 ['CombiningClass', '$comb', '$comb'],
96
97# BIDIRECTIONAL PROPERTIES
98
99 ['Bidirectional', '$bid', '$bid'],
100
101# Strong types:
102
103 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
104 # syllabic, and logographic
105 # characters (e.g., CJK
106 # ideographs)
107 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
108 # and punctuation specific to
109 # those scripts
110
7c6f5cd2
GS
111 ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
112 ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
113 ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
114 ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
115 ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
116 ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
117 ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
118 ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
119
a0ed51b3
LW
120# Weak types:
121
122 ['IsBidiEN','$bid eq "EN"', ''], # European Number
123 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
124 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
125 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
126 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
127
128# Separators:
129
130 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
131 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
132
133# Neutrals:
134
135 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
136 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
137 # characters: punctuation,
138 # symbols
139
140# Decomposition
141
142 ['Decomposition', '$decomp', '$decomp'],
143 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
144 ['IsDecoCompat', '$decomp =~ /^</', ''],
145 ['IsDCfont', '$decomp =~ /^<font>/', ''],
146 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
147 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
148 ['IsDCinital', '$decomp =~ /^<medial>/', ''],
149 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
150 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
151 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
152 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
153 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
154 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
155 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
156 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
157 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
158 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
7c6f5cd2 159 ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
a0ed51b3
LW
160 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
161
162# Number
163
0139fc7d 164 ['Number', '$num ne ""', '$num'],
a0ed51b3
LW
165
166# Mirrored
167
168 ['IsMirrored', '$mir eq "Y"', ''],
169
170# Arabic
171
172 ['ArabLink', '1', '$link'],
173 ['ArabLnkGrp', '1', '$linkgroup'],
174
175# Jamo
176
177 ['JamoShort', '1', '$short'],
499bfa7a
JH
178
179# Syllables
180
7c6f5cd2
GS
181 syllable_defs(),
182
a77b4ae5
GS
183# Line break properties - Normative
184
185 ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
186 ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
187 ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
188 ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
189 ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
190 ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
191 ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
192 ['IsLbrkSP','$brk eq "SP"', ''], # Space
193 ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
194
195# Line break properties - Informative
196 ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
197 ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
198 ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
199 ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
200 ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
201 ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
202 ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
203 ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
204 ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
205 ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
206 ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
207 ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
208 ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
209 ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
210 ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
211 ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
212 ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
213 ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
214 ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
215 ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
a0ed51b3
LW
216);
217
218# This is not written for speed...
219
220foreach $file (@todo) {
221 my ($table, $wanted, $val) = @$file;
222 next if @ARGV and not grep { $_ eq $table } @ARGV;
223 print $table,"\n";
224 if ($table =~ /^(Is|In|To)(.*)/) {
225 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
226 }
227 else {
228 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
229 }
14055466
JH
230 print OUT <<EOH;
231# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
232# This file is built by $0 from e.g. $UnicodeData.
233# Any changes made here will be lost!
234EOH
a0ed51b3
LW
235 print OUT <<"END";
236return <<'END';
237END
238 print OUT proplist($table, $wanted, $val);
239 print OUT "END\n";
240 close OUT;
241}
242
243# Must treat blocks specially.
244
245exit if @ARGV and not grep { $_ eq Block } @ARGV;
246print "Block\n";
7c6f5cd2
GS
247open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
248open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
14055466
JH
249print OUT <<EOH;
250# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
251# This file is built by $0 from e.g. $UnicodeData.
252# Any changes made here will be lost!
253EOH
a0ed51b3
LW
254print OUT <<"END";
255return <<'END';
256END
257
258while (<UD>) {
259 next if /^#/;
260 next if /^$/;
261 chomp;
262 ($code, $last, $name) = split(/; */);
263 if ($name) {
264 print OUT "$code $last $name\n";
265 $name =~ s/\s+//g;
266 open(BLOCK, ">In/$name.pl");
14055466
JH
267 print BLOCK <<EOH;
268# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
269# This file is built by $0 from e.g. $UnicodeData.
270# Any changes made here will be lost!
271EOH
a0ed51b3
LW
272 print BLOCK <<"END2";
273return <<'END';
274$code $last
275END
276END2
277 close BLOCK;
278 }
279}
280
281print OUT "END\n";
282close OUT;
283
284##################################################
285
286sub proplist {
287 my ($table, $wanted, $val) = @_;
288 my @wanted;
289 my $out;
290 my $split;
291
7c6f5cd2
GS
292 return listFromPropFile($wanted) if $val eq $PropData;
293
a0ed51b3 294 if ($table =~ /^Arab/) {
d357d9fe 295 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
296
297 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
298 }
299 elsif ($table =~ /^Jamo/) {
505afebf 300 open(UD, "Jamo.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
301
302 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
303 }
499bfa7a 304 elsif ($table =~ /^IsSyl/) {
7c6f5cd2 305 open(UD, $SyllableData) or warn "Can't open $table: $!";
499bfa7a
JH
306
307 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
308 }
a77b4ae5
GS
309 elsif ($table =~ /^IsLbrk/) {
310 open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
311
312 $split = '($code, $brk, $name) = split(/;/);';
313 }
a0ed51b3 314 else {
11695a73 315 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3
LW
316
317 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
318 $comment, $up, $down, $title) = split(/;/);';
319 }
320
321 if ($table =~ /^(?:To|Is)[A-Z]/) {
322 eval <<"END";
323 while (<UD>) {
324 next if /^#/;
7c6f5cd2
GS
325 next if /^\\s/;
326 s/\\s+\$//;
a0ed51b3
LW
327 $split
328 if ($wanted) {
329 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
330 }
331 }
332END
333 die $@ if $@;
334
335 while (@wanted) {
336 $beg = shift @wanted;
337 $last = $beg;
338 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
339 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
340 $last = shift @wanted;
341 }
342 $out .= sprintf "%04x", $beg->[0];
343 if ($beg->[2]) {
344 $last = shift @wanted;
345 }
346 if ($beg == $last) {
347 $out .= "\t";
348 }
349 else {
350 $out .= sprintf "\t%04x", $last->[0];
351 }
352 $out .= sprintf "\t%04x", $beg->[1] if $val;
353 $out .= "\n";
354 }
355 }
356 else {
357 eval <<"END";
358 while (<UD>) {
359 next if /^#/;
7c6f5cd2 360 next if /^\\s*\$/;
a0ed51b3
LW
361 chop;
362 $split
363 if ($wanted) {
364 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
365 }
366 }
367END
368 die $@ if $@;
369
370 while (@wanted) {
371 $beg = shift @wanted;
372 $last = $beg;
373 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
374 ($wanted[0]->[1] eq $last->[1])) {
375 $last = shift @wanted;
376 }
377 $out .= sprintf "%04x", $beg->[0];
378 if ($beg->[2]) {
379 $last = shift @wanted;
380 }
381 if ($beg == $last) {
382 $out .= "\t";
383 }
384 else {
385 $out .= sprintf "\t%04x", $last->[0];
386 }
387 $out .= sprintf "\t%s\n", $beg->[1];
388 }
389 }
390 $out;
391}
11695a73 392
7c6f5cd2
GS
393sub listFromPropFile {
394 my ($wanted) = @_;
395 my $out;
396
397 open (UD, $PropData) or die "Can't open $PropData: $!\n";
398 local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
399
400 <UD>;
401 while (<UD>) {
402 chomp;
403 if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
404 s/\(\d+ chars\)//g;
405 s/^\s+//mg;
406 s/\s+$//mg;
407 s/\.\./\t/g;
408 $out = lc $_;
409 last;
410 }
411 }
412 close (UD);
413 "$out\n";
414}
415
416sub syllable_defs {
417 my @defs;
418 my %seen;
419
420 open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
421 while (<SD>) {
422 next if /^\s*(#|$)/;
423 s/\s+$//;
424 ($code, $name, $syl) = split /; */;
425 next unless $syl;
426 push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
427 unless $seen{$syl}++;
428 }
429 close (SD);
430 return (@defs);
431}
432
6dd159d1 433# eof