Porting/mk_PL_charclass.pl

   1 #!perl -w
   2 use 5.012;
   3 use strict;
   4 use warnings;
   5
   6 # This program outputs the 256 lines that form the guts of the PL_charclass
   7 # table.  The output should be used to manually replace the table contents in
   8 # perl.h.  Each line is a bit map of properties that the Unicode code point at
   9 # the corresponding position in the table array has.  The first line
  10 # corresponds to code point U+0000, NULL, the last line to U=00FF.  For an
  11 # application to see if the code point "i" has a particular property, it just
  12 # does
  13 #    'PL_charclass[i] & BIT'
  14 # The bit names are of the form '_CC_property_suffix', where 'CC' stands for
  15 # character class, and 'property' is the corresponding property, and 'suffix'
  16 # is one of '_A' to mean the property is true only if the corresponding code
  17 # point is ASCII, and '_L1' means that the range includes any Latin1
  18 # character (ISO-8859-1 including the C0 and C1 controls).  A property without
  19 # these suffixes does not have different forms for both ranges.
  20
  21 # The data in the table is pretty well set in stone, so that this program need
  22 # be run only when adding new properties to it.
  23
  24 my @properties = qw(
  25     ALNUMC_A
  26     ALNUMC_L1
  27     ALPHA_A
  28     ALPHA_L1
  29     BLANK_A
  30     BLANK_L1
  31     CHARNAME_CONT
  32     CNTRL_A
  33     CNTRL_L1
  34     DIGIT_A
  35     GRAPH_A
  36     GRAPH_L1
  37     IDFIRST_A
  38     IDFIRST_L1
  39     LOWER_A
  40     LOWER_L1
  41     OCTAL_A
  42     PRINT_A
  43     PRINT_L1
  44     PSXSPC_A
  45     PSXSPC_L1
  46     PUNCT_A
  47     PUNCT_L1
  48     SPACE_A
  49     SPACE_L1
  50     UPPER_A
  51     UPPER_L1
  52     WORDCHAR_A
  53     WORDCHAR_L1
  54     XDIGIT_A
  55 );
  56
  57 my @bits;   # Bit map for each code point
  58
  59 for my $ord (0..255) {
  60     my $char = chr($ord);
  61     utf8::upgrade($char);   # Important to use Unicode semantics!
  62     for my $property (@properties) {
  63         my $name = $property;
  64
  65         # The property name that corresponds to this doesn't have a suffix.
  66         # If is a latin1 version, no further checking is needed.
  67         if (! ($name =~ s/_L1$//)) {
  68
  69             # Here, isn't an L1.  It's either a special one or the suffix ends
  70             # in _A.  In the latter case, it's automatically false for
  71             # non-ascii.  The one current special is valid over the whole range.
  72             next if $name =~ s/_A$// && $ord >= 128;
  73
  74         }
  75         my $re;
  76         if ($name eq 'PUNCT') {;
  77
  78             # Sadly, this is inconsistent: \pP and \pS for the ascii range,
  79             # just \pP outside it.
  80             $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
  81         } elsif ($name eq 'CHARNAME_CONT') {;
  82             $re = qr/[-\w ():\xa0]/;
  83         } elsif ($name eq 'SPACE') {;
  84             $re = qr/\s/;
  85         } elsif ($name eq 'IDFIRST') {
  86             $re = qr/[_\p{Alpha}]/;
  87         } elsif ($name eq 'PSXSPC') {
  88             $re = qr/[\v\p{Space}]/;
  89         } elsif ($name eq 'WORDCHAR') {
  90             $re = qr/\w/;
  91         } elsif ($name eq 'ALNUMC') {
  92             # Like \w, but no underscore
  93             $re = qr/\p{Alnum}/;
  94         } elsif ($name eq 'OCTAL') {
  95             $re = qr/[0-7]/;
  96         } else {    # The remainder have the same name and values as Unicode
  97             $re = eval "qr/\\p{$name}/";
  98             use Carp;
  99             carp $@ if ! defined $re;
 100         }
 101         #print "$ord, $name $property, $re\n";
 102         if ($char =~ $re) {  # Add this property if matches
 103             $bits[$ord] .= '|' if $bits[$ord];
 104             $bits[$ord] .= "_CC_$property";
 105         }
 106     }
 107     #print __LINE__, " $ord $char $bits[$ord]\n";
 108 }
 109
 110 # Names of C0 controls
 111 my @C0 = qw (
 112                 NUL
 113                 SOH
 114                 STX
 115                 ETX
 116                 EOT
 117                 ENQ
 118                 ACK
 119                 BEL
 120                 BS
 121                 HT
 122                 LF
 123                 VT
 124                 FF
 125                 CR
 126                 SO
 127                 SI
 128                 DLE
 129                 DC1
 130                 DC2
 131                 DC3
 132                 DC4
 133                 NAK
 134                 SYN
 135                 ETB
 136                 CAN
 137                 EOM
 138                 SUB
 139                 ESC
 140                 FS
 141                 GS
 142                 RS
 143                 US
 144             );
 145
 146 # Names of C1 controls, plus the adjacent DEL
 147 my @C1 = qw(
 148                 DEL
 149                 PAD
 150                 HOP
 151                 BPH
 152                 NBH
 153                 IND
 154                 NEL
 155                 SSA
 156                 ESA
 157                 HTS
 158                 HTJ
 159                 VTS
 160                 PLD
 161                 PLU
 162                 RI
 163                 SS2
 164                 SS3
 165                 DCS
 166                 PU1
 167                 PU2
 168                 STS
 169                 CCH
 170                 MW
 171                 SPA
 172                 EPA
 173                 SOS
 174                 SGC
 175                 SCI
 176                 CSI
 177                 ST
 178                 OSC
 179                 PM
 180                 APC
 181             );
 182
 183 # Output the table using fairly short names for each char.
 184 for my $ord (0..255) {
 185     my $name;
 186     if ($ord < 32) {    # A C0 control
 187         $name = $C0[$ord];
 188     } elsif ($ord > 32 && $ord < 127) { # Graphic
 189         $name = "'" . chr($ord) . "'";
 190     } elsif ($ord >= 127 && $ord <= 0x9f) {
 191         $name = $C1[$ord - 127];    # A C1 control + DEL
 192     } else {    # SPACE, or, if Latin1, shorten the name */
 193         use charnames();
 194         $name = charnames::viacode($ord);
 195         $name =~ s/LATIN CAPITAL LETTER //
 196         || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
 197     }
 198     printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
 199 }
 200