This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The perlretut was still talking about the old \p and \P
[perl5.git] / lib / unicode / mktables.PL
CommitLineData
a0ed51b3
LW
1#!../../miniperl
2
7c6f5cd2
GS
3use bytes;
4
190eec7c 5$UnicodeData = "Unicode.txt";
7c6f5cd2 6$SyllableData = "syllables.txt";
8836d2a5 7$PropData = "PropList.txt";
7c6f5cd2 8
11695a73 9
a0ed51b3
LW
10# Note: we try to keep filenames unique within first 8 chars. Using
11# subdirectories for the following helps.
289d4f09
ML
12mkdir "In", 0755;
13mkdir "Is", 0755;
14mkdir "To", 0755;
a0ed51b3
LW
15
16@todo = (
17# typical
18
f59877d4
JH
19 # 005F: SPACING UNDERSCROE
20 ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''],
21 ['IsAlnum', '$cat =~ /^[LMN]/', ''],
22 ['IsAlpha', '$cat =~ /^[LM]/', ''],
23 # 0009: HORIZONTAL TABULATION
24 # 000A: LINE FEED
25 # 000B: VERTICAL TABULATION
26 # 000C: FORM FEED
27 # 000D: CARRIAGE RETURN
3bec3564 28 # 0020: SPACE
f59877d4
JH
29 ['IsSpace', '$cat =~ /^Z/ ||
30 $code =~ /^(0009|000A|000B|000C|000D)$/', ''],
3bec3564
JH
31 ['IsSpacePerl',
32 '$cat =~ /^Z/ ||
33 $code =~ /^(0009|000A|000C|000D)$/', ''],
34 ['IsBlank', '$cat =~ /^Z[^lp]$/ || $code eq "0009"', ''],
b8c5462f 35 ['IsDigit', '$cat =~ /^Nd$/', ''],
7c6f5cd2 36 ['IsUpper', '$cat =~ /^L[ut]$/', ''],
b8c5462f 37 ['IsLower', '$cat =~ /^Ll$/', ''],
f59877d4 38 ['IsASCII', '$code le "007f"', ''],
b8c5462f 39 ['IsCntrl', '$cat =~ /^C/', ''],
f59877d4
JH
40 ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''],
41 ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''],
42 ['IsPunct', '$cat =~ /^P/', ''],
43 # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
b8c5462f
JH
44 ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
45 ['ToUpper', '$up', '$up'],
46 ['ToLower', '$down', '$down'],
47 ['ToTitle', '$title', '$title'],
48 ['ToDigit', '$dec ne ""', '$dec'],
a0ed51b3
LW
49
50# Name
51
52 ['Name', '$name', '$name'],
53
54# Category
55
56 ['Category', '$cat', '$cat'],
57
58# Normative
59
60 ['IsM', '$cat =~ /^M/', ''], # Mark
61 ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
62 ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
7c6f5cd2 63 ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing
a0ed51b3
LW
64
65 ['IsN', '$cat =~ /^N/', ''], # Number
66 ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
67 ['IsNo', '$cat eq "No"', ''], # Number, Other
7c6f5cd2 68 ['IsNl', '$cat eq "Nl"', ''], # Number, Letter
a0ed51b3 69
7c6f5cd2 70 ['IsZ', '$cat =~ /^Z/', ''], # Separator
a0ed51b3
LW
71 ['IsZs', '$cat eq "Zs"', ''], # Separator, Space
72 ['IsZl', '$cat eq "Zl"', ''], # Separator, Line
73 ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
74
75 ['IsC', '$cat =~ /^C/', ''], # Crazy
76 ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
77 ['IsCo', '$cat eq "Co"', ''], # Other, Private Use
78 ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
7c6f5cd2
GS
79 ['IsCf', '$cat eq "Cf"', ''], # Other, Format
80 ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate
81 ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned
a0ed51b3
LW
82
83# Informative
84
85 ['IsL', '$cat =~ /^L/', ''], # Letter
86 ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
87 ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
88 ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
89 ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
90 ['IsLo', '$cat eq "Lo"', ''], # Letter, Other
91
92 ['IsP', '$cat =~ /^P/', ''], # Punctuation
93 ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
94 ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
95 ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
96 ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
7c6f5cd2
GS
97 ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector
98 ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote
99 ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote
a0ed51b3
LW
100
101 ['IsS', '$cat =~ /^S/', ''], # Symbol
102 ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
7c6f5cd2 103 ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier
a0ed51b3
LW
104 ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
105 ['IsSo', '$cat eq "So"', ''], # Symbol, Other
106
107# Combining class
108 ['CombiningClass', '$comb', '$comb'],
109
110# BIDIRECTIONAL PROPERTIES
111
112 ['Bidirectional', '$bid', '$bid'],
113
114# Strong types:
115
116 ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
117 # syllabic, and logographic
118 # characters (e.g., CJK
119 # ideographs)
120 ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
121 # and punctuation specific to
122 # those scripts
123
7c6f5cd2
GS
124 ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding
125 ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override
126 ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic
127 ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding
128 ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override
129 ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format
130 ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark
131 ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral
132
a0ed51b3
LW
133# Weak types:
134
135 ['IsBidiEN','$bid eq "EN"', ''], # European Number
136 ['IsBidiES','$bid eq "ES"', ''], # European Number Separator
137 ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
138 ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
139 ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
140
141# Separators:
142
143 ['IsBidiB', '$bid eq "B"', ''], # Block Separator
144 ['IsBidiS', '$bid eq "S"', ''], # Segment Separator
145
146# Neutrals:
147
148 ['IsBidiWS','$bid eq "WS"', ''], # Whitespace
149 ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
150 # characters: punctuation,
151 # symbols
152
153# Decomposition
154
155 ['Decomposition', '$decomp', '$decomp'],
156 ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
157 ['IsDecoCompat', '$decomp =~ /^</', ''],
158 ['IsDCfont', '$decomp =~ /^<font>/', ''],
159 ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
160 ['IsDCinitial', '$decomp =~ /^<initial>/', ''],
f59877d4 161 ['IsDCmedial', '$decomp =~ /^<medial>/', ''],
a0ed51b3
LW
162 ['IsDCfinal', '$decomp =~ /^<final>/', ''],
163 ['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
164 ['IsDCcircle', '$decomp =~ /^<circle>/', ''],
165 ['IsDCsuper', '$decomp =~ /^<super>/', ''],
166 ['IsDCsub', '$decomp =~ /^<sub>/', ''],
167 ['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
168 ['IsDCwide', '$decomp =~ /^<wide>/', ''],
169 ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
170 ['IsDCsmall', '$decomp =~ /^<small>/', ''],
171 ['IsDCsquare', '$decomp =~ /^<square>/', ''],
7c6f5cd2 172 ['IsDCfraction', '$decomp =~ /^<fraction>/', ''],
a0ed51b3
LW
173 ['IsDCcompat', '$decomp =~ /^<compat>/', ''],
174
175# Number
176
e1b504a6 177 ['Number', '$num ne ""', '$num'],
a0ed51b3
LW
178
179# Mirrored
180
181 ['IsMirrored', '$mir eq "Y"', ''],
182
183# Arabic
184
185 ['ArabLink', '1', '$link'],
186 ['ArabLnkGrp', '1', '$linkgroup'],
187
188# Jamo
189
190 ['JamoShort', '1', '$short'],
499bfa7a
JH
191
192# Syllables
193
7c6f5cd2
GS
194 syllable_defs(),
195
a77b4ae5
GS
196# Line break properties - Normative
197
198 ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break
199 ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return
200 ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed
201 ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks
202 ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates
203 ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue)
204 ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity
205 ['IsLbrkSP','$brk eq "SP"', ''], # Space
206 ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space
207
208# Line break properties - Informative
209 ['IsLbrkXX','$brk eq "XX"', ''], # Unknown
210 ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation
211 ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation
212 ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation
213 ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter
214 ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation
215 ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks
216 ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric)
217 ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric)
218 ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric)
219 ['IsLbrkNU','$brk eq "NU"', ''], # Numeric
220 ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters
221 ['IsLbrkID','$brk eq "ID"', ''], # Ideographic
222 ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable
223 ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen
224 ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before
225 ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After
226 ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian)
227 ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic)
228 ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After
a0ed51b3
LW
229);
230
231# This is not written for speed...
232
233foreach $file (@todo) {
234 my ($table, $wanted, $val) = @$file;
235 next if @ARGV and not grep { $_ eq $table } @ARGV;
236 print $table,"\n";
237 if ($table =~ /^(Is|In|To)(.*)/) {
238 open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
239 }
240 else {
241 open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
242 }
14055466
JH
243 print OUT <<EOH;
244# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
245# This file is built by $0 from e.g. $UnicodeData.
246# Any changes made here will be lost!
247EOH
a0ed51b3
LW
248 print OUT <<"END";
249return <<'END';
250END
251 print OUT proplist($table, $wanted, $val);
252 print OUT "END\n";
253 close OUT;
254}
255
256# Must treat blocks specially.
257
258exit if @ARGV and not grep { $_ eq Block } @ARGV;
259print "Block\n";
7c6f5cd2
GS
260open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n";
261open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n";
14055466
JH
262print OUT <<EOH;
263# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
264# This file is built by $0 from e.g. $UnicodeData.
265# Any changes made here will be lost!
266EOH
a0ed51b3
LW
267print OUT <<"END";
268return <<'END';
269END
270
271while (<UD>) {
272 next if /^#/;
273 next if /^$/;
274 chomp;
275 ($code, $last, $name) = split(/; */);
276 if ($name) {
277 print OUT "$code $last $name\n";
278 $name =~ s/\s+//g;
279 open(BLOCK, ">In/$name.pl");
14055466
JH
280 print BLOCK <<EOH;
281# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
282# This file is built by $0 from e.g. $UnicodeData.
283# Any changes made here will be lost!
284EOH
a0ed51b3
LW
285 print BLOCK <<"END2";
286return <<'END';
287$code $last
288END
289END2
290 close BLOCK;
291 }
292}
293
294print OUT "END\n";
295close OUT;
296
297##################################################
298
299sub proplist {
300 my ($table, $wanted, $val) = @_;
301 my @wanted;
302 my $out;
303 my $split;
304
7c6f5cd2
GS
305 return listFromPropFile($wanted) if $val eq $PropData;
306
a0ed51b3 307 if ($table =~ /^Arab/) {
d357d9fe 308 open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
309
310 $split = '($code, $name, $link, $linkgroup) = split(/; */);';
311 }
312 elsif ($table =~ /^Jamo/) {
505afebf 313 open(UD, "Jamo.txt") or warn "Can't open $table: $!";
a0ed51b3
LW
314
315 $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
316 }
499bfa7a 317 elsif ($table =~ /^IsSyl/) {
7c6f5cd2 318 open(UD, $SyllableData) or warn "Can't open $table: $!";
499bfa7a
JH
319
320 $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
321 }
a77b4ae5
GS
322 elsif ($table =~ /^IsLbrk/) {
323 open(UD, "LineBrk.txt") or warn "Can't open $table: $!";
324
325 $split = '($code, $brk, $name) = split(/;/);';
326 }
a0ed51b3 327 else {
11695a73 328 open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
a0ed51b3
LW
329
330 $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
331 $comment, $up, $down, $title) = split(/;/);';
332 }
333
334 if ($table =~ /^(?:To|Is)[A-Z]/) {
335 eval <<"END";
336 while (<UD>) {
337 next if /^#/;
7c6f5cd2
GS
338 next if /^\\s/;
339 s/\\s+\$//;
a0ed51b3
LW
340 $split
341 if ($wanted) {
342 push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
343 }
344 }
345END
346 die $@ if $@;
347
348 while (@wanted) {
349 $beg = shift @wanted;
350 $last = $beg;
351 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
352 (not $val or $wanted[0]->[1] == $last->[1] + 1)) {
353 $last = shift @wanted;
354 }
355 $out .= sprintf "%04x", $beg->[0];
356 if ($beg->[2]) {
357 $last = shift @wanted;
358 }
359 if ($beg == $last) {
360 $out .= "\t";
361 }
362 else {
363 $out .= sprintf "\t%04x", $last->[0];
364 }
365 $out .= sprintf "\t%04x", $beg->[1] if $val;
366 $out .= "\n";
367 }
368 }
369 else {
370 eval <<"END";
371 while (<UD>) {
372 next if /^#/;
7c6f5cd2 373 next if /^\\s*\$/;
a0ed51b3
LW
374 chop;
375 $split
376 if ($wanted) {
377 push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
378 }
379 }
380END
381 die $@ if $@;
382
383 while (@wanted) {
384 $beg = shift @wanted;
385 $last = $beg;
386 while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
387 ($wanted[0]->[1] eq $last->[1])) {
388 $last = shift @wanted;
389 }
390 $out .= sprintf "%04x", $beg->[0];
391 if ($beg->[2]) {
392 $last = shift @wanted;
393 }
394 if ($beg == $last) {
395 $out .= "\t";
396 }
397 else {
398 $out .= sprintf "\t%04x", $last->[0];
399 }
400 $out .= sprintf "\t%s\n", $beg->[1];
401 }
402 }
403 $out;
404}
11695a73 405
7c6f5cd2
GS
406sub listFromPropFile {
407 my ($wanted) = @_;
408 my $out;
409
410 open (UD, $PropData) or die "Can't open $PropData: $!\n";
411 local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42?
412
413 <UD>;
414 while (<UD>) {
415 chomp;
416 if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) {
417 s/\(\d+ chars\)//g;
418 s/^\s+//mg;
419 s/\s+$//mg;
420 s/\.\./\t/g;
421 $out = lc $_;
422 last;
423 }
424 }
425 close (UD);
426 "$out\n";
427}
428
429sub syllable_defs {
430 my @defs;
431 my %seen;
432
433 open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n";
434 while (<SD>) {
435 next if /^\s*(#|$)/;
436 s/\s+$//;
437 ($code, $name, $syl) = split /; */;
438 next unless $syl;
439 push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, ''])
440 unless $seen{$syl}++;
441 }
442 close (SD);
443 return (@defs);
444}
445
6dd159d1 446# eof