Porting/mk_PL_charclass.pl

   1 #!perl -w
   2 use 5.012;
   3 use strict;
   4 use warnings;
   5 use Config;
   6
   7 # This program outputs the 256 lines that form the guts of the PL_charclass
   8 # table.  The output should be used to manually replace the table contents in
   9 # l1_charclass_tab.h.  Each line is a bit map of properties that the Unicode
  10 # code point at the corresponding position in the table array has.  The first
  11 # line corresponds to code point U+0000, NULL, the last line to U=00FF.  For
  12 # an application to see if the code point "i" has a particular property, it
  13 # just does
  14 #    'PL_charclass[i] & BIT'
  15 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  16 # character class, and 'property' is the corresponding property, and 'suffix'
  17 # is one of '_A' to mean the property is true only if the corresponding code
  18 # point is ASCII, and '_L1' means that the range includes any Latin1
  19 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  20 # these suffixes does not have different forms for both ranges.
  21
  22 # The data in the table is pretty well set in stone, so that this program need
  23 # be run only when adding new properties to it.
  24
  25 my @properties = qw(
  26     ALNUMC_A
  27     ALNUMC_L1
  28     ALPHA_A
  29     ALPHA_L1
  30     BLANK_A
  31     BLANK_L1
  32     CHARNAME_CONT
  33     CNTRL_A
  34     CNTRL_L1
  35     DIGIT_A
  36     GRAPH_A
  37     GRAPH_L1
  38     IDFIRST_A
  39     IDFIRST_L1
  40     LOWER_A
  41     LOWER_L1
  42     OCTAL_A
  43     PRINT_A
  44     PRINT_L1
  45     PSXSPC_A
  46     PSXSPC_L1
  47     PUNCT_A
  48     PUNCT_L1
  49     SPACE_A
  50     SPACE_L1
  51     UPPER_A
  52     UPPER_L1
  53     WORDCHAR_A
  54     WORDCHAR_L1
  55     XDIGIT_A
  56 );
  57
  58 # Read in the case fold mappings.
  59 my %folded_closure;
  60 my $file="$Config{privlib}/unicore/CaseFolding.txt";
  61 open my $fh, "<", $file or die "Failed to read '$file': $!";
  62 while (<$fh>) {
  63     chomp;
  64
  65     # Lines look like (without the initial '#'
  66     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  67     my ($line, $comment) = split / \s+ \# \s+ /x, $_;
  68     next if $line eq "" || substr($line, 0, 1) eq '#';
  69     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
  70
  71     my $from = hex $hex_from;
  72
  73     # Perl only deals with C and F folds
  74     next if $fold_type ne 'C' and $fold_type ne 'F';
  75
  76     # Get each code point in the range that participates in this line's fold.
  77     # The hash has keys of each code point in the range, and values of what it
  78     # folds to and what folds to it
  79     foreach my $hex_fold (@folded) {
  80         my $fold = hex $hex_fold;
  81         push @{$folded_closure{$fold}}, $from if $fold < 256;
  82         push @{$folded_closure{$from}}, $fold if $from < 256;
  83     }
  84 }
  85
  86 # Now having read all the lines, combine them into the full closure of each
  87 # code point in the range by adding lists together that share a common element
  88 foreach my $folded (keys %folded_closure) {
  89     foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
  90         push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
  91     }
  92 }
  93
  94 my @bits;   # Bit map for each code point
  95
  96 foreach my $folded (keys %folded_closure) {
  97     $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
  98                                                 @{$folded_closure{$folded}};
  99 }
 100
 101 for my $ord (0..255) {
 102     my $char = chr($ord);
 103     utf8::upgrade($char);   # Important to use Unicode semantics!
 104     for my $property (@properties) {
 105         my $name = $property;
 106
 107         # The property name that corresponds to this doesn't have a suffix.
 108         # If is a latin1 version, no further checking is needed.
 109         if (! ($name =~ s/_L1$//)) {
 110
 111             # Here, isn't an L1.  It's either a special one or the suffix ends
 112             # in _A.  In the latter case, it's automatically false for
 113             # non-ascii.  The one current special is valid over the whole range.
 114             next if $name =~ s/_A$// && $ord >= 128;
 115
 116         }
 117         my $re;
 118         if ($name eq 'PUNCT') {;
 119
 120             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 121             # just \pP outside it.
 122             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 123         } elsif ($name eq 'CHARNAME_CONT') {;
 124             $re = qr/[-\w ():\xa0]/;
 125         } elsif ($name eq 'SPACE') {;
 126             $re = qr/\s/;
 127         } elsif ($name eq 'IDFIRST') {
 128             $re = qr/[_\p{Alpha}]/;
 129         } elsif ($name eq 'PSXSPC') {
 130             $re = qr/[\v\p{Space}]/;
 131         } elsif ($name eq 'WORDCHAR') {
 132             $re = qr/\w/;
 133         } elsif ($name eq 'ALNUMC') {
 134             # Like \w, but no underscore
 135             $re = qr/\p{Alnum}/;
 136         } elsif ($name eq 'OCTAL') {
 137             $re = qr/[0-7]/;
 138         } else {    # The remainder have the same name and values as Unicode
 139             $re = eval "qr/\\p{$name}/";
 140             use Carp;
 141             carp $@ if ! defined $re;
 142         }
 143         #print "$ord, $name $property, $re\n";
 144         if ($char =~ $re) {  # Add this property if matches
 145             $bits[$ord] .= '|' if $bits[$ord];
 146             $bits[$ord] .= "_CC_$property";
 147         }
 148     }
 149     #print __LINE__, " $ord $char $bits[$ord]\n";
 150 }
 151
 152 # Names of C0 controls
 153 my @C0 = qw (
 154                 NUL
 155                 SOH
 156                 STX
 157                 ETX
 158                 EOT
 159                 ENQ
 160                 ACK
 161                 BEL
 162                 BS
 163                 HT
 164                 LF
 165                 VT
 166                 FF
 167                 CR
 168                 SO
 169                 SI
 170                 DLE
 171                 DC1
 172                 DC2
 173                 DC3
 174                 DC4
 175                 NAK
 176                 SYN
 177                 ETB
 178                 CAN
 179                 EOM
 180                 SUB
 181                 ESC
 182                 FS
 183                 GS
 184                 RS
 185                 US
 186             );
 187
 188 # Names of C1 controls, plus the adjacent DEL
 189 my @C1 = qw(
 190                 DEL
 191                 PAD
 192                 HOP
 193                 BPH
 194                 NBH
 195                 IND
 196                 NEL
 197                 SSA
 198                 ESA
 199                 HTS
 200                 HTJ
 201                 VTS
 202                 PLD
 203                 PLU
 204                 RI
 205                 SS2
 206                 SS3
 207                 DCS
 208                 PU1
 209                 PU2
 210                 STS
 211                 CCH
 212                 MW
 213                 SPA
 214                 EPA
 215                 SOS
 216                 SGC
 217                 SCI
 218                 CSI
 219                 ST
 220                 OSC
 221                 PM
 222                 APC
 223             );
 224
 225 # Output the table using fairly short names for each char.
 226 for my $ord (0..255) {
 227     my $name;
 228     if ($ord < 32) {    # A C0 control
 229         $name = $C0[$ord];
 230     } elsif ($ord > 32 && $ord < 127) { # Graphic
 231         $name = "'" . chr($ord) . "'";
 232     } elsif ($ord >= 127 && $ord <= 0x9f) {
 233         $name = $C1[$ord - 127];    # A C1 control + DEL
 234     } else {    # SPACE, or, if Latin1, shorten the name */
 235         use charnames();
 236         $name = charnames::viacode($ord);
 237         $name =~ s/LATIN CAPITAL LETTER //
 238         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 239     }
 240     printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 241 }
 242