This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
integrate changes#7205..7210,7212,7214..7219,7222,7223,7225,7226,
[perl5.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3
LW
1#!../../miniperl
2
7c6f5cd2
GS
3use bytes;
4
1e8df282 5$UnicodeData = "Unicode.301";
7c6f5cd2 6$SyllableData = "syllables.txt";
1e8df282 7$PropData = "PropList.txt";
7c6f5cd2 8
11695a73 9
a0ed51b3
LW
10# Note: we try to keep filenames unique within first 8 chars. Using
11# subdirectories for the following helps.
d6f4130b
GS
12mkdir "In", 0755;
13mkdir "Is", 0755;
14mkdir "To", 0755;
a0ed51b3
LW
15
16@todo = (
17# typical
18
e2439105
GS
19 # 005F: SPACING UNDERSCROE
20 ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''],
21 ['IsAlnum', '$cat =~ /^[LMN]/', ''],
22 ['IsAlpha', '$cat =~ /^[LM]/', ''],
23 # 0009: HORIZONTAL TABULATION
24 # 000A: LINE FEED
25 # 000B: VERTICAL TABULATION
26 # 000C: FORM FEED
27 # 000D: CARRIAGE RETURN
28 ['IsSpace', '$cat =~ /^Z/ ||
29 $code =~ /^(0009|000A|000B|000C|000D)$/', ''],
b8c5462f 30 ['IsDigit', '$cat =~ /^Nd$/', ''],
7c6f5cd2 31 ['IsUpper', '$cat =~ /^L[ut]$/', ''],
b8c5462f 32 ['IsLower', '$cat =~ /^Ll$/', ''],
e2439105 33 ['IsASCII', '$code le "007f"', ''],
b8c5462f 34 ['IsCntrl', '$cat =~ /^C/', ''],
e2439105
GS
35 ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''],
36 ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''],
37 ['IsPunct', '$cat =~ /^P/', ''],
38 # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
b8c5462f
JH
39 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
40 ['ToUpper', '$up', '$up'],
41 ['ToLower', '$down', '$down'],
42 ['ToTitle', '$title', '$title'],
43 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3
LW
44
45# Name
46
47 ['Name', '$name', '$name'],
48
49# Category
50
51 ['Category', '$cat', '$cat'],
52
53# Normative
54
55 ['IsM', '$cat =~ /^M/', ''], # Mark
56 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
57 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
7c6f5cd2 58 ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
a0ed51b3
LW
59
60 ['IsN', '$cat =~ /^N/', ''], # Number
61 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
62 ['IsNo', '$cat eq "No"', ''], # Number, Other
7c6f5cd2 63 ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
a0ed51b3 64
7c6f5cd2 65 ['IsZ', '$cat =~ /^Z/', ''], # Separator
a0ed51b3
LW
66 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
67 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
68 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
69
70 ['IsC', '$cat =~ /^C/', ''], # Crazy
71 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
72 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
73 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
7c6f5cd2
GS
74 ['IsCf', '$cat eq "Cf"', ''], # Other, Format
75 ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
76 ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
a0ed51b3
LW
77
78# Informative
79
80 ['IsL', '$cat =~ /^L/', ''], # Letter
81 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
82 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
83 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
84 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
85 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
86
87 ['IsP', '$cat =~ /^P/', ''], # Punctuation
88 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
89 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
90 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
91 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
7c6f5cd2
GS
92 ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
93 ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
94 ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
a0ed51b3
LW
95
96 ['IsS', '$cat =~ /^S/', ''], # Symbol
97 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
7c6f5cd2 98 ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
a0ed51b3
LW
99 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
100 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
101
102# Combining class
103 ['CombiningClass', '$comb', '$comb'],
104
105# BIDIRECTIONAL PROPERTIES
106
107 ['Bidirectional', '$bid', '$bid'],
108
109# Strong types:
110
111 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
112 # syllabic, and logographic
113 # characters (e.g., CJK
114 # ideographs)
115 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
116 # and punctuation specific to
117 # those scripts
118
7c6f5cd2
GS
119 ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
120 ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
121 ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
122 ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
123 ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
124 ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
125 ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
126 ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
127
a0ed51b3
LW
128# Weak types:
129
130 ['IsBidiEN','$bid eq "EN"', ''], # European Number
131 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
132 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
133 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
134 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
135
136# Separators:
137
138 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
139 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
140
141# Neutrals:
142
143 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
144 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
145 # characters: punctuation,
146 # symbols
147
148# Decomposition
149
150 ['Decomposition', '$decomp', '$decomp'],
151 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
152 ['IsDecoCompat', '$decomp =~ /^</', ''],
153 ['IsDCfont', '$decomp =~ /^<font>/', ''],
154 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
155 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
e2439105 156 ['IsDCmedial', '$decomp =~ /^<medial>/', ''],
a0ed51b3
LW
157 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
158 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
159 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
160 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
161 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
162 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
163 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
164 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
165 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
166 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
7c6f5cd2 167 ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
a0ed51b3
LW
168 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
169
170# Number
171
0139fc7d 172 ['Number', '$num ne ""', '$num'],
a0ed51b3
LW
173
174# Mirrored
175
176 ['IsMirrored', '$mir eq "Y"', ''],
177
178# Arabic
179
180 ['ArabLink', '1', '$link'],
181 ['ArabLnkGrp', '1', '$linkgroup'],
182
183# Jamo
184
185 ['JamoShort', '1', '$short'],
499bfa7a
JH
186
187# Syllables
188
7c6f5cd2
GS
189 syllable_defs(),
190
a77b4ae5
GS
191# Line break properties - Normative
192
193 ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
194 ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
195 ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
196 ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
197 ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
198 ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
199 ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
200 ['IsLbrkSP','$brk eq "SP"', ''], # Space
201 ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
202
203# Line break properties - Informative
204 ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
205 ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
206 ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
207 ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
208 ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
209 ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
210 ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
211 ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
212 ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
213 ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
214 ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
215 ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
216 ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
217 ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
218 ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
219 ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
220 ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
221 ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
222 ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
223 ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
a0ed51b3
LW
224);
225
226# This is not written for speed...
227
228foreach $file (@todo) {
229 my ($table, $wanted, $val) = @$file;
230 next if @ARGV and not grep { $_ eq $table } @ARGV;
231 print $table,"\n";
232 if ($table =~ /^(Is|In|To)(.*)/) {
233 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
234 }
235 else {
236 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
237 }
14055466
JH
238 print OUT <<EOH;
239# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
240# This file is built by $0 from e.g. $UnicodeData.
241# Any changes made here will be lost!
242EOH
a0ed51b3
LW
243 print OUT <<"END";
244return <<'END';
245END
246 print OUT proplist($table, $wanted, $val);
247 print OUT "END\n";
248 close OUT;
249}
250
251# Must treat blocks specially.
252
253exit if @ARGV and not grep { $_ eq Block } @ARGV;
254print "Block\n";
7c6f5cd2
GS
255open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
256open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
14055466
JH
257print OUT <<EOH;
258# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
259# This file is built by $0 from e.g. $UnicodeData.
260# Any changes made here will be lost!
261EOH
a0ed51b3
LW
262print OUT <<"END";
263return <<'END';
264END
265
266while (<UD>) {
267 next if /^#/;
268 next if /^$/;
269 chomp;
270 ($code, $last, $name) = split(/; */);
271 if ($name) {
272 print OUT "$code $last $name\n";
273 $name =~ s/\s+//g;
274 open(BLOCK, ">In/$name.pl");
14055466
JH
275 print BLOCK <<EOH;
276# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
277# This file is built by $0 from e.g. $UnicodeData.
278# Any changes made here will be lost!
279EOH
a0ed51b3
LW
280 print BLOCK <<"END2";
281return <<'END';
282$code $last
283END
284END2
285 close BLOCK;
286 }
287}
288
289print OUT "END\n";
290close OUT;
291
292##################################################
293
294sub proplist {
295 my ($table, $wanted, $val) = @_;
296 my @wanted;
297 my $out;
298 my $split;
299
7c6f5cd2
GS
300 return listFromPropFile($wanted) if $val eq $PropData;
301
a0ed51b3 302 if ($table =~ /^Arab/) {
d357d9fe 303 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
304
305 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
306 }
307 elsif ($table =~ /^Jamo/) {
505afebf 308 open(UD, "Jamo.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
309
310 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
311 }
499bfa7a 312 elsif ($table =~ /^IsSyl/) {
7c6f5cd2 313 open(UD, $SyllableData) or warn "Can't open $table: $!";
499bfa7a
JH
314
315 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
316 }
a77b4ae5
GS
317 elsif ($table =~ /^IsLbrk/) {
318 open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
319
320 $split = '($code, $brk, $name) = split(/;/);';
321 }
a0ed51b3 322 else {
11695a73 323 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3
LW
324
325 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
326 $comment, $up, $down, $title) = split(/;/);';
327 }
328
329 if ($table =~ /^(?:To|Is)[A-Z]/) {
330 eval <<"END";
331 while (<UD>) {
332 next if /^#/;
7c6f5cd2
GS
333 next if /^\\s/;
334 s/\\s+\$//;
a0ed51b3
LW
335 $split
336 if ($wanted) {
337 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
338 }
339 }
340END
341 die $@ if $@;
342
343 while (@wanted) {
344 $beg = shift @wanted;
345 $last = $beg;
346 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
347 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
348 $last = shift @wanted;
349 }
350 $out .= sprintf "%04x", $beg->[0];
351 if ($beg->[2]) {
352 $last = shift @wanted;
353 }
354 if ($beg == $last) {
355 $out .= "\t";
356 }
357 else {
358 $out .= sprintf "\t%04x", $last->[0];
359 }
360 $out .= sprintf "\t%04x", $beg->[1] if $val;
361 $out .= "\n";
362 }
363 }
364 else {
365 eval <<"END";
366 while (<UD>) {
367 next if /^#/;
7c6f5cd2 368 next if /^\\s*\$/;
a0ed51b3
LW
369 chop;
370 $split
371 if ($wanted) {
372 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
373 }
374 }
375END
376 die $@ if $@;
377
378 while (@wanted) {
379 $beg = shift @wanted;
380 $last = $beg;
381 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
382 ($wanted[0]->[1] eq $last->[1])) {
383 $last = shift @wanted;
384 }
385 $out .= sprintf "%04x", $beg->[0];
386 if ($beg->[2]) {
387 $last = shift @wanted;
388 }
389 if ($beg == $last) {
390 $out .= "\t";
391 }
392 else {
393 $out .= sprintf "\t%04x", $last->[0];
394 }
395 $out .= sprintf "\t%s\n", $beg->[1];
396 }
397 }
398 $out;
399}
11695a73 400
7c6f5cd2
GS
401sub listFromPropFile {
402 my ($wanted) = @_;
403 my $out;
404
405 open (UD, $PropData) or die "Can't open $PropData: $!\n";
406 local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
407
408 <UD>;
409 while (<UD>) {
410 chomp;
411 if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
412 s/\(\d+ chars\)//g;
413 s/^\s+//mg;
414 s/\s+$//mg;
415 s/\.\./\t/g;
416 $out = lc $_;
417 last;
418 }
419 }
420 close (UD);
421 "$out\n";
422}
423
424sub syllable_defs {
425 my @defs;
426 my %seen;
427
428 open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
429 while (<SD>) {
430 next if /^\s*(#|$)/;
431 s/\s+$//;
432 ($code, $name, $syl) = split /; */;
433 next unless $syl;
434 push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
435 unless $seen{$syl}++;
436 }
437 close (SD);
438 return (@defs);
439}
440
6dd159d1 441# eof