Porting/mk_PL_charclass.pl

   1 #!perl -w
   2 use 5.012;
   3 use strict;
   4 use warnings;
   5 use Config;
   6
   7 # This program outputs the 256 lines that form the guts of the PL_charclass
   8 # table.  The output should be used to manually replace the table contents in
   9 # l1_charclass_tab.h.  Each line is a bit map of properties that the Unicode
  10 # code point at the corresponding position in the table array has.  The first
  11 # line corresponds to code point U+0000, NULL, the last line to U=00FF.  For
  12 # an application to see if the code point "i" has a particular property, it
  13 # just does
  14 #    'PL_charclass[i] & BIT'
  15 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  16 # character class, and 'property' is the corresponding property, and 'suffix'
  17 # is one of '_A' to mean the property is true only if the corresponding code
  18 # point is ASCII, and '_L1' means that the range includes any Latin1
  19 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  20 # these suffixes does not have different forms for both ranges.
  21
  22 # The data in the table is pretty well set in stone, so that this program need
  23 # be run only when adding new properties to it.
  24
  25 my @properties = qw(
  26     ALNUMC_A
  27     ALNUMC_L1
  28     ALPHA_A
  29     ALPHA_L1
  30     BLANK_A
  31     BLANK_L1
  32     CHARNAME_CONT
  33     CNTRL_A
  34     CNTRL_L1
  35     DIGIT_A
  36     GRAPH_A
  37     GRAPH_L1
  38     IDFIRST_A
  39     IDFIRST_L1
  40     LOWER_A
  41     LOWER_L1
  42     OCTAL_A
  43     PRINT_A
  44     PRINT_L1
  45     PSXSPC_A
  46     PSXSPC_L1
  47     PUNCT_A
  48     PUNCT_L1
  49     SPACE_A
  50     SPACE_L1
  51     UPPER_A
  52     UPPER_L1
  53     WORDCHAR_A
  54     WORDCHAR_L1
  55     XDIGIT_A
  56 );
  57
  58 # Read in the case fold mappings.
  59 my %folded_closure;
  60 my $file="$Config{privlib}/unicore/CaseFolding.txt";
  61 open my $fh, "<", $file or die "Failed to read '$file': $!";
  62 while (<$fh>) {
  63     chomp;
  64
  65     # Lines look like (without the initial '#'
  66     #0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
  67     my ($line, $comment) = split / \s+ \# \s+ /x, $_;
  68     next if $line eq "" || substr($line, 0, 1) eq '#';
  69     my ($hex_from, $fold_type, @folded) = split /[\s;]+/, $line;
  70
  71     my $from = hex $hex_from;
  72
  73     # Perl only deals with C and F folds
  74     next if $fold_type ne 'C' and $fold_type ne 'F';
  75     next if $fold_type ne 'C';  # And for now, just single-char folds. XXX
  76
  77     # Get each code point in the range that participates in this line's fold.
  78     # The hash has keys of each code point in the range, and values of what it
  79     # folds to and what folds to it
  80     foreach my $hex_fold (@folded) {
  81         my $fold = hex $hex_fold;
  82         push @{$folded_closure{$fold}}, $from if $fold < 256;
  83         push @{$folded_closure{$from}}, $fold if $from < 256;
  84     }
  85 }
  86
  87 # Now having read all the lines, combine them into the full closure of each
  88 # code point in the range by adding lists together that share a common element
  89 foreach my $folded (keys %folded_closure) {
  90     foreach my $from (grep { $_ < 256 } @{$folded_closure{$folded}}) {
  91         push @{$folded_closure{$from}}, @{$folded_closure{$folded}};
  92     }
  93 }
  94
  95 my @bits;   # Bit map for each code point
  96
  97 foreach my $folded (keys %folded_closure) {
  98     $bits[$folded] = "_CC_NONLATIN1_FOLD" if grep { $_ > 255 }
  99                                                 @{$folded_closure{$folded}};
 100 }
 101
 102 for my $ord (0..255) {
 103     my $char = chr($ord);
 104     utf8::upgrade($char);   # Important to use Unicode semantics!
 105     for my $property (@properties) {
 106         my $name = $property;
 107
 108         # The property name that corresponds to this doesn't have a suffix.
 109         # If is a latin1 version, no further checking is needed.
 110         if (! ($name =~ s/_L1$//)) {
 111
 112             # Here, isn't an L1.  It's either a special one or the suffix ends
 113             # in _A.  In the latter case, it's automatically false for
 114             # non-ascii.  The one current special is valid over the whole range.
 115             next if $name =~ s/_A$// && $ord >= 128;
 116
 117         }
 118         my $re;
 119         if ($name eq 'PUNCT') {;
 120
 121             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
 122             # just \pP outside it.
 123             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
 124         } elsif ($name eq 'CHARNAME_CONT') {;
 125             $re = qr/[-\w ():\xa0]/;
 126         } elsif ($name eq 'SPACE') {;
 127             $re = qr/\s/;
 128         } elsif ($name eq 'IDFIRST') {
 129             $re = qr/[_\p{Alpha}]/;
 130         } elsif ($name eq 'PSXSPC') {
 131             $re = qr/[\v\p{Space}]/;
 132         } elsif ($name eq 'WORDCHAR') {
 133             $re = qr/\w/;
 134         } elsif ($name eq 'ALNUMC') {
 135             # Like \w, but no underscore
 136             $re = qr/\p{Alnum}/;
 137         } elsif ($name eq 'OCTAL') {
 138             $re = qr/[0-7]/;
 139         } else {    # The remainder have the same name and values as Unicode
 140             $re = eval "qr/\\p{$name}/";
 141             use Carp;
 142             carp $@ if ! defined $re;
 143         }
 144         #print "$ord, $name $property, $re\n";
 145         if ($char =~ $re) {  # Add this property if matches
 146             $bits[$ord] .= '|' if $bits[$ord];
 147             $bits[$ord] .= "_CC_$property";
 148         }
 149     }
 150     #print __LINE__, " $ord $char $bits[$ord]\n";
 151 }
 152
 153 # Names of C0 controls
 154 my @C0 = qw (
 155                 NUL
 156                 SOH
 157                 STX
 158                 ETX
 159                 EOT
 160                 ENQ
 161                 ACK
 162                 BEL
 163                 BS
 164                 HT
 165                 LF
 166                 VT
 167                 FF
 168                 CR
 169                 SO
 170                 SI
 171                 DLE
 172                 DC1
 173                 DC2
 174                 DC3
 175                 DC4
 176                 NAK
 177                 SYN
 178                 ETB
 179                 CAN
 180                 EOM
 181                 SUB
 182                 ESC
 183                 FS
 184                 GS
 185                 RS
 186                 US
 187             );
 188
 189 # Names of C1 controls, plus the adjacent DEL
 190 my @C1 = qw(
 191                 DEL
 192                 PAD
 193                 HOP
 194                 BPH
 195                 NBH
 196                 IND
 197                 NEL
 198                 SSA
 199                 ESA
 200                 HTS
 201                 HTJ
 202                 VTS
 203                 PLD
 204                 PLU
 205                 RI
 206                 SS2
 207                 SS3
 208                 DCS
 209                 PU1
 210                 PU2
 211                 STS
 212                 CCH
 213                 MW
 214                 SPA
 215                 EPA
 216                 SOS
 217                 SGC
 218                 SCI
 219                 CSI
 220                 ST
 221                 OSC
 222                 PM
 223                 APC
 224             );
 225
 226 # Output the table using fairly short names for each char.
 227 for my $ord (0..255) {
 228     my $name;
 229     if ($ord < 32) {    # A C0 control
 230         $name = $C0[$ord];
 231     } elsif ($ord > 32 && $ord < 127) { # Graphic
 232         $name = "'" . chr($ord) . "'";
 233     } elsif ($ord >= 127 && $ord <= 0x9f) {
 234         $name = $C1[$ord - 127];    # A C1 control + DEL
 235     } else {    # SPACE, or, if Latin1, shorten the name */
 236         use charnames();
 237         $name = charnames::viacode($ord);
 238         $name =~ s/LATIN CAPITAL LETTER //
 239         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 240     }
 241     printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 242 }
 243