This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Rewrite mktables from scratch.
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302
JH
1#!/usr/bin/perl -w
2
3#
4# mktables -- create the runtime Perl Unicode files (lib/unicore/**/*.pl)
5# from the Unicode database files (lib/unicore/*.txt).
6#
7
8my $LastUnicodeCodepoint = 0x10FFFF; # As of Unicode 3.1.1.
9
10use strict;
11
12mkdir("In", 0755);
13mkdir("Is", 0755);
14mkdir("To", 0755);
15
16sub extend {
17 my ($table, $last) = @_;
18
19 $table->[-1]->[1] = $last;
20}
21
22sub append {
23 my ($table, $code, $name) = @_;
24 if (@$table &&
25 hex($table->[-1]->[1]) == hex($code) - 1 &&
26 (!defined $name || $table->[-1]->[2] eq $name)) {
27 extend($table, $code);
28 } else {
29 push @$table, [$code, $code, $name];
30 }
31}
32
33sub inverse {
34 my ($table) = @_;
35 my $inverse = [];
36 my ($first, $last);
37 if ($table->[0]->[0]) {
38 $last = hex($table->[0]->[0]);
39 push @$inverse, [ "0000",
40 sprintf("%04X", $last - 1) ];
41 }
42 for my $i (0..$#$table-1) {
43 $first = defined $table->[$i ]->[1] ?
44 hex($table->[$i ]->[1]) : 0;
45 $last = defined $table->[$i + 1]->[0] ?
46 hex($table->[$i + 1]->[0]) : $first;
47 push @$inverse, [ sprintf("%04X", $first + 1),
48 sprintf("%04X", $last - 1) ]
49 unless $first + 1 == $last;
50 }
51 return $inverse;
52}
53
54sub header {
55 my $fh = shift;
56
57 print $fh <<EOT;
58# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
59# This file is built by $0 from e.g. Unicode.txt.
60# Any changes made here will be lost!
61EOT
62}
63
64sub begin {
65 my $fh = shift;
66
67 print $fh <<EOT;
68return <<'END';
69EOT
70}
71
72sub end {
73 my $fh = shift;
74
75 print $fh <<EOT;
76END
77EOT
78}
79
80sub flush {
81 my ($table, $file) = @_;
82 print "$file\n";
83 if (open(my $fh, ">$file")) {
84 header($fh);
85 begin($fh);
86 for my $i (@$table) {
87 print $fh $i->[0], "\t",
88 $i->[1] ne $i->[0] ? $i->[1] : "", "\t",
89 defined $i->[2] ? $i->[2] : "", "\n";
90 }
91 end($fh);
92 close($fh);
93 } else {
94 die "$0: $file: $!\n";
95 }
96}
97
98#
99# The %In contains the mapping of the script/block name into a number.
100#
101
102my %In;
103my $InId = 0;
104my %InIn;
105
106#
107# Read in the Unicode.txt, the main Unicode database.
108#
109
110my %Cat;
111my %General;
112my @General;
113
114if (open(my $Unicode, "Unicode.txt")) {
115 my @Name;
116 my @Bidi;
117 my %Bidi;
118 my @Comb;
119 my @Deco;
120 my %Deco;
121 my %DC;
122 my @Number;
123 my @Mirrored;
124 my %To;
125 while (<$Unicode>) {
126 next if /^\#/ || /^\s*$/;
127 next unless /^[0-9a-f]+\s*;/i;
128 s/\s+$//;
129 my ($code, $name, $cat, $comb, $bidi, $deco,
130 $decimal, $digit, $number,
131 $mirrored, $unicode10, $comment,
132 $upper, $lower, $title) = split(/\s*;\s*/);
133
134 if ($name =~ /^<(.+), (First|Last)>$/) {
135 $name = $1;
136 if ($2 eq 'First') {
137 append($General{$name} ||= [], $code, $name);
138 } else {
139 extend($General{$name} , $code);
140 }
141 unless (defined $In{$name}) {
142 $In{$name} = $InId++;
143 $InIn{$name} = $General{$name};
144 }
145 append($Cat{$cat} ||= [], $code);
146 append($Cat{substr($cat, 0, 1)}
147 ||= [], $code);
148 } else {
149 append(\@Name, $code, $name);
150
151 append(\@General, $code, $cat);
152
153 append($Cat{$cat} ||= [], $code);
154 append($Cat{substr($cat, 0, 1)}
155 ||= [], $code);
156 # 005F: SPACING UNDERSCORE
157 append($Cat{Word} ||= [], $code)
158 if $cat =~ /^[LMN]/ or $code eq "005F";
159 append($Cat{Alnum} ||= [], $code)
160 if $cat =~ /^[LMN]/;
161 append($Cat{Alpha} ||= [], $code)
162 if $cat =~ /^[LM]/;
163 # 0009: HORIZONTAL TABULATION
164 # 000A: LINE FEED
165 # 000B: VERTICAL TABULATION
166 # 000C: FORM FEED
167 # 000D: CARRIAGE RETURN
168 # 0020: SPACE
169 append($Cat{Space} ||= [], $code)
170 if $cat =~ /^Z/ ||
171 $code =~ /^(0009|000A|000B|000C|000D)$/;
172 append($Cat{SpacePerl} ||= [], $code)
173 if $cat =~ /^Z/ ||
174 $code =~ /^(0009|000A|000C|000D)$/;
175 append($Cat{Blank} ||= [], $code)
176 if $code =~ /^(0020|0009)$/ ||
177 $cat =~ /^Z[^lp]$/;
178 append($Cat{Digit} ||= [], $code) if $cat eq "Nd";
179 append($Cat{Upper} ||= [], $code) if $cat eq "Lu";
180 append($Cat{Lower} ||= [], $code) if $cat eq "Ll";
181 append($Cat{Title} ||= [], $code) if $cat eq "Lt";
182 append($Cat{ASCII} ||= [], $code) if $code le "007F";
183 append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/;
184 append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
185 append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
186 append($Cat{Punct} ||= [], $code) if $cat =~ /^P/;
187 # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
188 append($Cat{XDigit} ||= [], $code)
189 if $code =~ /^00(3[0-9]|[46][1-6])$/;
190
191 append($To{Upper} ||= [], $code, $upper) if $upper;
192 append($To{Lower} ||= [], $code, $lower) if $lower;
193 append($To{Title} ||= [], $code, $title) if $title;
194 append($To{Digit} ||= [], $code, $decimal) if $decimal;
195
196 append(\@Bidi, $code, $bidi);
197 append($Bidi{$bidi} ||= [], $code);
198
199 append(\@Comb, $code, $comb) if $comb;
200
201 if ($deco) {
202 append(\@Deco, $code, $deco);
203 if ($deco =~/^<(\w+)>/) {
204 append($Deco{Compat} ||= [], $code);
205 append($DC{$1} ||= [], $code);
206 } else {
207 append($Deco{Canon} ||= [], $code);
208 }
209 }
210
211 append(\@Number, $code, $number) if $number;
212
213 append(\@Mirrored, $code) if $mirrored eq "Y";
214 }
215 }
216
217 flush(\@Name, "Name.pl");
218
219 foreach my $cat (sort keys %Cat) {
220 flush($Cat{$cat}, "Is/$cat.pl");
221 }
222
223 foreach my $to (sort keys %To) {
224 flush($To{$to}, "To/$to.pl");
225 }
226
227 flush(\@Bidi, "Bidirectional.pl");
228 foreach my $bidi (sort keys %Bidi) {
229 flush($Bidi{$bidi}, "Is/Bidi$bidi.pl");
230 }
231
232 flush(\@Comb, "CombiningClass.pl");
233
234 flush(\@Deco, "Decomposition.pl");
235 foreach my $deco (sort keys %Deco) {
236 flush($Deco{$deco}, "Is/Deco$deco.pl");
237 }
238 foreach my $dc (sort keys %DC) {
239 flush($DC{$dc}, "Is/DC$dc.pl");
240 }
241
242 flush(\@Number, "Number.pl");
243
244 flush(\@Mirrored, "Is/Mirrored.pl");
245} else {
246 die "$0: Unicode.txt: $!\n";
247}
248
249# The general cateory can be written out already now.
250
251flush(\@General, "Category.pl");
252
253#
254# Read in the LineBrk.txt.
255#
256
257if (open(my $LineBrk, "LineBrk.txt")) {
258 my @Lbrk;
259 my %Lbrk;
260
261 while (<$LineBrk>) {
262 next if /^\#/ || /^\s*$/;
263 s/\s+$//;
264 s/\s*\#.*//;
265 next unless /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s*;\s*(.+)$/i;
266
267 my ($first, $last, $lbrk) = ($1, $2, $3);
268
269 append(\@Lbrk, $first, $lbrk);
270 append($Lbrk{$lbrk} ||= [], $first);
271 if (defined $last) {
272 extend(\@Lbrk, $last);
273 extend($Lbrk{$lbrk}, $last);
274 }
275 }
276
277 flush(\@Lbrk, "Lbrk.pl");
278 foreach my $lbrk (sort keys %Lbrk) {
279 flush($Lbrk{$lbrk}, "Is/Lbrk$lbrk.pl");
280 }
281} else {
282 die "$0: LineBrk.txt: $!\n";
283}
284
285#
286# Read in the ArabShap.txt.
287#
288
289if (open(my $ArabShap, "ArabShap.txt")) {
290 my @ArabLink;
291 my @ArabLinkGroup;
292
293 while (<$ArabShap>) {
294 next if /^\#/ || /^\s*$/;
295 next unless /^[0-9a-f]+\s*;/i;
296 s/\s+$//;
297 my ($code, $name, $link, $linkgroup) = split(/\s*;\s*/);
298
299 append(\@ArabLink, $code, $link);
300 append(\@ArabLinkGroup, $code, $linkgroup);
301 }
302
303 flush(\@ArabLink, "ArabLink.pl");
304 flush(\@ArabLinkGroup, "ArabLnkGrp.pl");
305} else {
306 die "$0: ArabShap.txt: $!\n";
307}
308
309#
310# Read in the Jamo.txt.
311#
312
313if (open(my $Jamo, "Jamo.txt")) {
314 my @Short;
315
316 while (<$Jamo>) {
317 next if /^\#/ || /^\s*$/;
318 next unless /^[0-9a-f]+\s*;/i;
319 s/\s*\#.*//;
320 s/\s+$//;
321 my ($code, $short) = split(/\s*;\s*/);
322
323 append(\@Short, $code, $short);
324 }
325
326 flush(\@Short, "JamoShort.pl");
327} else {
328 die "$0: Jamo.txt: $!\n";
329}
330
331#
332# Read in the Scripts.txt.
333#
334
335my @Scripts;
336
337if (open(my $Scripts, "Scripts.txt")) {
338 while (<$Scripts>) {
339 next if /^\#/ || /^\s*$/;
340 s/\s*\#.*//;
341 s/\s+$//;
342 next unless /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s*;\s*(.+)$/i;
343
344 # Wait until all the scripts have been read since
345 # they are not listed in numeric order.
346 push @Scripts, [ hex($1), $1, $2, $3 ];
347 }
348} else {
349 die "$0: Scripts.txt: $!\n";
350}
351
352# Now append the scripts properties in their code point order.
353
354my %Script;
355my $Scripts = [];
356
357for my $script (sort { $a->[0] <=> $b->[0] } @Scripts) {
358 my ($code, $first, $last, $name) = @$script;
359 append($Scripts, $first, $name);
360 append($Script{$name} ||= [], $first, $name);
361 if (defined $last) {
362 extend($Scripts, $last);
363 extend($Script{$name}, $last);
364 }
365 unless (defined $In{$name}) {
366 $In{$name} = $InId++;
367 $InIn{$name} = $Script{$name};
368 }
369}
370
371# Scripts.pl can be written out already now.
372
373flush(\@Scripts, "Scripts.pl");
374
375# Common is everything not explicitly assigned to a Script
376
377$In{Common} = $InId++;
378my $Common = inverse($Scripts);
379$InIn{Common} = $Common;
380
381#
382# Read in the Blocks.txt.
383#
384
385my @Blocks;
386my %Blocks;
387
388if (open(my $Blocks, "Blocks.txt")) {
389 while (<$Blocks>) {
390 next if /^\#/ || /^\s*$/;
391 s/\s*\#.*//;
392 s/\s+$//;
393 next unless /^([0-9a-f]+)\.\.([0-9a-f]+)\s*;\s*(.+)$/i;
394
395 my ($first, $last, $name) = ($1, $2, $3);
396
397 # If there's a naming conflict (the script names are
398 # in uppercase), the name of the block has " Block"
399 # appended to it.
400 $name = "$name Block" if defined $In{"\U$name"};
401
402 append(\@Blocks, $first, $name);
403 append($Blocks{$name} ||= [], $first, $name);
404 if (defined $last) {
405 extend(\@Blocks, $last);
406 extend($Blocks{$name}, $last);
407 }
408 unless (defined $In{$name}) {
409 $In{$name} = $InId++;
410 $InIn{$name} = $Blocks{$name};
411 }
412 }
413} else {
414 die "$0: Blocks.txt: $!\n";
415}
416
417# Blocks.pl can be written out already now.
418
419flush(\@Blocks, "Blocks.pl");
420
421#
422# Read in the PropList.txt. It contains extended properties not
423# listed in the Unicode.txt, such as 'Other_Alphabetic':
424# alphabetic but not of the general category L; many modifiers
425# belong to this extended property category: while they are not
426# alphabets, they are alphabetic in nature.
427#
428
429my @Props;
430
431if (open(my $Props, "PropList.txt")) {
432 while (<$Props>) {
433 next if /^\#/ || /^\s*$/;
434 s/\s*\#.*//;
435 s/\s+$//;
436 next unless /^([0-9a-f]+)(?:\.\.([0-9a-f]+))?\s*;\s*(\w+)/i;
437
438 # Wait until all the extended properties have been read since
439 # they are not listed in numeric order.
440 push @Props, [ hex($1), $1, $2, $3 ];
441 }
442} else {
443 die "$0: PropList.txt: $!\n";
444}
445
446# Now append the extended properties in their code point order.
447
448my %Prop;
449my $Props = [];
450
451for my $prop (sort { $a->[0] <=> $b->[0] } @Props) {
452 my ($code, $first, $last, $name) = @$prop;
453 append($Props, $first, $name);
454 append($Prop{$name} ||= [], $first, $name);
455 if (defined $last) {
456 extend($Props, $last);
457 extend($Prop{$name}, $last);
458 }
459 unless (defined $In{$name}) {
460 $In{$name} = $InId++;
461 $InIn{$name} = $Prop{$name};
462 }
463}
464
465# Assigned is everything not Cn aka Noncharacter_Code_Point
466
467$In{Assigned} = $InId++;
468my $Assigned = inverse($Prop{Noncharacter_Code_Point});
469$InIn{Assigned} = $Assigned;
470
471sub merge_general_and_extended {
472 my ($name, $general, $extended) = @_;
473 my $merged;
474
475 push @$merged,
476 map { pop @{$_}; $_ }
477 sort { $a->[2] <=> $b->[2] }
478 map { [ $_->[0], $_->[1], hex($_->[0]) ] }
479 ($general ?
480 map { ref $_ ? @$_ : $_ }
481 @Cat {ref $general ? @$general : $general } :
482 (),
483 $extended ?
484 map { ref $_ ? @$_ : $_ }
485 @Prop{ref $extended ? @$extended : $extended} :
486 ());
487
488 $In{$name} = $InId++;
489 $InIn{$name} = $merged;
490
491 return $merged;
492}
493
494# Alphabetic is L and Other_Alphabetic.
495
496my $Alphabetic =
497 merge_general_and_extended('Alphabetic', 'L', 'Other_Alphabetic');
498
499# Lowercase is Ll and Other_Lowercase.
500
501my $Lowercase =
502 merge_general_and_extended('Lowercase', 'Ll', 'Other_Lowercase');
503
504# Uppercase is Lu and Other_Uppercase.
505
506my $Uppercase =
507 merge_general_and_extended('Uppercase', 'Lu', 'Other_Uppercase');
508
509# Math is Sm and Other_Math.
510
511my $Math =
512 merge_general_and_extended('Math', 'Sm', 'Other_Math');
513
514# Lampersand is Ll, Lu, and Lt.
515
516my $Lampersand =
517 merge_general_and_extended('Lampersand', [ qw(Ll Lu Lt) ]);
518
519# ID_Start is Ll, Lu, Lt, Lm, Lo, and Nl.
520
521my $ID_Start =
522 merge_general_and_extended('ID_Start', [ qw(Ll Lu Lt Lm Lo Nl) ]);
523
524# ID_Continue is ID_Start, Mn, Mc, Nd, and Pc.
525
526my $ID_Continue =
527 merge_general_and_extended('ID_Continue', [ qw(Ll Lu Lt Lm Lo Nl
528 Mn Mc Nd Pc) ]);
529
530#
531# Any is any.
532#
533
534$In{Any} = $InId++;
535my $Any = [ [ 0, sprintf("%04X", $LastUnicodeCodepoint) ] ];
536$InIn{Any} = $Any;
537
538#
539# mapping() will be used to write out the In and Is virtual mappings.
540#
541
542sub mapping {
543 my ($map, $name) = @_;
544
545 if (open(my $fh, ">$name.pl")) {
546 print "$name.pl\n";
547 header($fh);
548
549 # The %pat will hold a hash that maps the first two
550 # lowercased letters of a class to a 'fuzzified' regular
551 # expression that points to the real mapping.
552
553 my %pat;
554
555 # But first write out the offical name to real name
556 # (the filename) mapping.
557
558 print $fh <<EOT;
559%utf8::${name} =
560(
561EOT
562 for my $i (sort keys %$map) {
563 my $pat = $i;
564 # Here is the 'fuzzification': accept any space,
565 # dash, or underbar where in the official name
566 # there is space or a dash (or underbar, but
567 # there never is).
568 $pat =~ s/([- _])/(?:[-_]|\\s+)?/g;
569 # The prefix length of 2 is enough spread,
570 # and besides, we have 'Yi' as an In category.
571 push @{$pat{lc(substr($i, 0, 2))}}, [ $i, $pat ];
572 print $fh "'$i' => '$map->{$i}',\n";
573 }
574 print $fh <<EOT;
575);
576EOT
577
578 # Now write out the %pat mapping.
579
580 print $fh <<EOT;
581%utf8::${name}Pat =
582(
583EOT
584 foreach my $prefix (sort keys %pat) {
585 print $fh "'$prefix' => {\n";
586 foreach my $ipat (@{$pat{$prefix}}) {
587 my ($i, $pat) = @$ipat;
588 print $fh "\t'$pat' => '$map->{$i}',\n";
589 }
590 print $fh "},\n";
591 }
592 print $fh <<EOT;
593);
594EOT
595
596 close($fh);
597 } else {
598 die "$0: $name.pl: $!\n";
599 }
600}
601
602#
603# Write out the virtual In mappings.
604#
605
606mapping(\%In, "In");
607
608# Easy low-calorie cheat.
609use File::Copy;
610copy("In/$In{Noncharacter_Code_Point}.pl", "Is/Cn.pl");
611
612#
613# Write out the real In mappings
614# (the In.pl written out just above has the virtual In mappings)
615#
616
617foreach my $in (sort { $In{$a} <=> $In{$b} } keys %In) {
618 flush($InIn{$in}, "In/$In{$in}.pl");
619}
620
621#
622# The mapping from General Category long forms to short forms is
623# currently hardwired here since no simple data file in the UCD
624# seems to do that.
625#
626
627my %Is = (
628 'Letter' => 'L',
629 'Uppercase Letter' => 'Lu',
630 'Lowercase Letter' => 'Ll',
631 'Titlecase Letter' => 'Lt',
632 'Modifier Letter' => 'Lm',
633 'Other Letter' => 'Lo',
634
635 'Mark' => 'M',
636 'Non-Spacing Mark' => 'Mn',
637 'Spacing Combining Mark' => 'Mc',
638 'Enclosing Mark' => 'Me',
639
640 'Separator' => 'Z',
641 'Space Separator' => 'Zs',
642 'Line Separator' => 'Zl',
643 'Paragraph Separator' => 'Zp',
644
645 'Number' => 'N',
646 'Decimal Digit Number' => 'Nd',
647 'Letter Number' => 'Nl',
648 'Other Number' => 'No',
649
650 'Punctuation' => 'P',
651 'Connector Punctuation' => 'Pc',
652 'Dash Punctuation' => 'Pd',
653 'Open Punctuation' => 'Ps',
654 'Close Punctuation' => 'Pe',
655 'Initial Punctuation' => 'Pi',
656 'Final Punctuation' => 'Pf',
657 'Other Punctuation' => 'Po',
658
659 'Symbol' => 'S',
660 'Math Symbol' => 'Sm',
661 'Currency Symbol' => 'Sc',
662 'Modifier Symbol' => 'Sk',
663 'Other Symbol' => 'So',
664
665 'Other' => 'C',
666 'Control' => 'Cc',
667 'Format' => 'Cf',
668 'Surrogate' => 'Cs',
669 'Private Use' => 'Co',
670 'Not Assigned' => 'Cn',
671 # 'Other' aliases
672 'Other Control' => 'Cc',
673 'Other Format' => 'Cf',
674 'Other Surrogate' => 'Cs',
675 'Other Private Use' => 'Co',
676 'Other Not Assigned' => 'Cn',
677);
678
679#
680# Write out the virtual Is mappings.
681#
682
683mapping(\%Is, "Is");
684
685# That's all, folks!
686