[perl5.git] / Porting / mk_PL_charclass.pl

#!perl -w
use 5.012;
use strict;
use warnings;

# This program outputs the 256 lines that form the guts of the PL_charclass
# table.  The output should be used to manually replace the table contents in
# perl.h.  Each line is a bit map of properties that the Unicode code point at
# the corresponding position in the table array has.  The first line
# corresponds to code point U+0000, NULL, the last line to U=00FF.  For an
# application to see if the code point "i" has a particular property, it just
# does
#    'PL_charclass[i] & BIT'
# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
# character class, and 'property' is the corresponding property, and 'suffix'
# is one of '_A' to mean the property is true only if the corresponding code
# point is ASCII, and '_L1' means that the range includes any Latin1
# character (ISO-8859-1 including the C0 and C1 controls).  A property without
# these suffixes does not have different forms for both ranges.

# The data in the table is pretty well set in stone, so that this program need
# be run only when adding new properties to it.

my @properties = qw(
    ALNUMC_A
    ALNUMC_L1
    ALPHA_A
    ALPHA_L1
    BLANK_A
    BLANK_L1
    CHARNAME_CONT
    CNTRL_A
    CNTRL_L1
    DIGIT_A
    GRAPH_A
    GRAPH_L1
    IDFIRST_A
    IDFIRST_L1
    LOWER_A
    LOWER_L1
    OCTAL_A
    PRINT_A
    PRINT_L1
    PSXSPC_A
    PSXSPC_L1
    PUNCT_A
    PUNCT_L1
    SPACE_A
    SPACE_L1
    UPPER_A
    UPPER_L1
    WORDCHAR_A
    WORDCHAR_L1
    XDIGIT_A
);

my @bits;   # Bit map for each code point

for my $ord (0..255) {
    my $char = chr($ord);
    utf8::upgrade($char);   # Important to use Unicode semantics!
    for my $property (@properties) {
        my $name = $property;

        # The property name that corresponds to this doesn't have a suffix.
        # If is a latin1 version, no further checking is needed.
        if (! ($name =~ s/_L1$//)) {

            # Here, isn't an L1.  It's either a special one or the suffix ends
            # in _A.  In the latter case, it's automatically false for
            # non-ascii.  The one current special is valid over the whole range.
            next if $name =~ s/_A$// && $ord >= 128;

        }
        my $re;
        if ($name eq 'PUNCT') {;

            # Sadly, this is inconsistent: \pP and \pS for the ascii range,
            # just \pP outside it.
            $re = qr/\p{Punct}|[^\P{Symbol}\P{ASCII}]/;
        } elsif ($name eq 'CHARNAME_CONT') {;
            $re = qr/[-\w ():\xa0]/;
        } elsif ($name eq 'SPACE') {;
            $re = qr/\s/;
        } elsif ($name eq 'IDFIRST') {
            $re = qr/[_\p{Alpha}]/;
        } elsif ($name eq 'PSXSPC') {
            $re = qr/[\v\p{Space}]/;
        } elsif ($name eq 'WORDCHAR') {
            $re = qr/\w/;
        } elsif ($name eq 'ALNUMC') {
            # Like \w, but no underscore
            $re = qr/[^_\W]/;
        } elsif ($name eq 'OCTAL') {
            $re = qr/[0-7]/;
        } else {    # The remainder have the same name and values as Unicode
            $re = eval "qr/\\p{$name}/";
            use Carp;
            carp $@ if ! defined $re;
        }
        #print "$ord, $name $property, $re\n";
        if ($char =~ $re) {  # Add this property if matches
            $bits[$ord] .= '|' if $bits[$ord];
            $bits[$ord] .= "_CC_$property";
        }
    }
    #print __LINE__, " $ord $char $bits[$ord]\n";
}

# Names of C0 controls
my @C0 = qw (
                NUL
                SOH
                STX
                ETX
                EOT
                ENQ
                ACK
                BEL
                BS
                HT
                LF
                VT
                FF
                CR
                SO
                SI
                DLE
                DC1
                DC2
                DC3
                DC4
                NAK
                SYN
                ETB
                CAN
                EOM
                SUB
                ESC
                FS
                GS
                RS
                US
            );

# Names of C1 controls, plus the adjacent DEL
my @C1 = qw(
                DEL
                PAD
                HOP
                BPH
                NBH
                IND
                NEL
                SSA
                ESA
                HTS
                HTJ
                VTS
                PLD
                PLU
                RI 
                SS2
                SS3
                DCS
                PU1
                PU2
                STS
                CCH
                MW 
                SPA
                EPA
                SOS
                SGC
                SCI
                CSI
                ST 
                OSC
                PM 
                APC
            );

# Output the table using fairly short names for each char.
for my $ord (0..255) {
    my $name;
    if ($ord < 32) {    # A C0 control
        $name = $C0[$ord];
    } elsif ($ord > 32 && $ord < 127) { # Graphic
        $name = "'" . chr($ord) . "'";
    } elsif ($ord >= 127 && $ord <= 0x9f) {
        $name = $C1[$ord - 127];    # A C1 control + DEL
    } else {    # SPACE, or, if Latin1, shorten the name */
        use charnames();
        $name = charnames::viacode($ord);
        $name =~ s/LATIN CAPITAL LETTER //
        || $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
    }
    printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
}
Commit	Line	Data
9c68f0ab KW	1	#!perl -w
	2	use 5.012;
	3	use strict;
	4	use warnings;
	5
	6	# This program outputs the 256 lines that form the guts of the PL_charclass
	7	# table. The output should be used to manually replace the table contents in
	8	# perl.h. Each line is a bit map of properties that the Unicode code point at
	9	# the corresponding position in the table array has. The first line
	10	# corresponds to code point U+0000, NULL, the last line to U=00FF. For an
	11	# application to see if the code point "i" has a particular property, it just
	12	# does
	13	# 'PL_charclass[i] & BIT'
	14	# The bit names are of the form '_CC_property_suffix', where 'CC' stands for
	15	# character class, and 'property' is the corresponding property, and 'suffix'
	16	# is one of '_A' to mean the property is true only if the corresponding code
	17	# point is ASCII, and '_L1' means that the range includes any Latin1
	18	# character (ISO-8859-1 including the C0 and C1 controls). A property without
	19	# these suffixes does not have different forms for both ranges.
	20
	21	# The data in the table is pretty well set in stone, so that this program need
	22	# be run only when adding new properties to it.
	23
	24	my @properties = qw(
	25	ALNUMC_A
	26	ALNUMC_L1
	27	ALPHA_A
	28	ALPHA_L1
	29	BLANK_A
	30	BLANK_L1
	31	CHARNAME_CONT
	32	CNTRL_A
	33	CNTRL_L1
	34	DIGIT_A
	35	GRAPH_A
	36	GRAPH_L1
	37	IDFIRST_A
	38	IDFIRST_L1
	39	LOWER_A
	40	LOWER_L1
	41	OCTAL_A
	42	PRINT_A
	43	PRINT_L1
	44	PSXSPC_A
	45	PSXSPC_L1
	46	PUNCT_A
	47	PUNCT_L1
	48	SPACE_A
	49	SPACE_L1
	50	UPPER_A
	51	UPPER_L1
	52	WORDCHAR_A
	53	WORDCHAR_L1
	54	XDIGIT_A
	55	);
	56
	57	my @bits; # Bit map for each code point
	58
	59	for my $ord (0..255) {
	60	my $char = chr($ord);
	61	utf8::upgrade($char); # Important to use Unicode semantics!
	62	for my $property (@properties) {
	63	my $name = $property;
	64
65	# The property name that corresponds to this doesn't have a suffix.
66	# If is a latin1 version, no further checking is needed.
67	if (! ($name =~ s/_L1$//)) {
68
69	# Here, isn't an L1. It's either a special one or the suffix ends
70	# in _A. In the latter case, it's automatically false for
71	# non-ascii. The one current special is valid over the whole range.
72	next if $name =~ s/_A$// && $ord >= 128;
73
74	}
75	my $re;
76	if ($name eq 'PUNCT') {;
77
78	# Sadly, this is inconsistent: \pP and \pS for the ascii range,
79	# just \pP outside it.
80	$re = qr/\p{Punct}\|[^\P{Symbol}\P{ASCII}]/;
81	} elsif ($name eq 'CHARNAME_CONT') {;
82	$re = qr/[-\w ():\xa0]/;
83	} elsif ($name eq 'SPACE') {;
84	$re = qr/\s/;
85	} elsif ($name eq 'IDFIRST') {
86	$re = qr/[_\p{Alpha}]/;
87	} elsif ($name eq 'PSXSPC') {
88	$re = qr/[\v\p{Space}]/;
89	} elsif ($name eq 'WORDCHAR') {
90	$re = qr/\w/;
91	} elsif ($name eq 'ALNUMC') {
92	# Like \w, but no underscore
93	$re = qr/[^_\W]/;
94	} elsif ($name eq 'OCTAL') {
95	$re = qr/[0-7]/;
96	} else { # The remainder have the same name and values as Unicode
97	$re = eval "qr/\\p{$name}/";
98	use Carp;
99	carp $@ if ! defined $re;
100	}
101	#print "$ord, $name $property, $re\n";
102	if ($char =~ $re) { # Add this property if matches
103	$bits[$ord] .= '\|' if $bits[$ord];
104	$bits[$ord] .= "_CC_$property";
105	}
106	}
107	#print __LINE__, " $ord $char $bits[$ord]\n";
108	}
109
110	# Names of C0 controls
111	my @C0 = qw (
112	NUL
113	SOH
114	STX
115	ETX
116	EOT
117	ENQ
118	ACK
119	BEL
120	BS
121	HT
122	LF
123	VT
124	FF
125	CR
126	SO
127	SI
128	DLE
129	DC1
130	DC2
131	DC3
132	DC4
133	NAK
134	SYN
135	ETB
136	CAN
137	EOM
138	SUB
139	ESC
140	FS
141	GS
142	RS
143	US
144	);
145
146	# Names of C1 controls, plus the adjacent DEL
147	my @C1 = qw(
148	DEL
149	PAD
150	HOP
151	BPH
152	NBH
153	IND
154	NEL
155	SSA
156	ESA
157	HTS
158	HTJ
159	VTS
160	PLD
161	PLU
162	RI
163	SS2
164	SS3
165	DCS
166	PU1
167	PU2
168	STS
169	CCH
170	MW
171	SPA
172	EPA
173	SOS
174	SGC
175	SCI
176	CSI
177	ST
178	OSC
179	PM
180	APC
181	);
182
183	# Output the table using fairly short names for each char.
184	for my $ord (0..255) {
185	my $name;
186	if ($ord < 32) { # A C0 control
187	$name = $C0[$ord];
188	} elsif ($ord > 32 && $ord < 127) { # Graphic
189	$name = "'" . chr($ord) . "'";
190	} elsif ($ord >= 127 && $ord <= 0x9f) {
191	$name = $C1[$ord - 127]; # A C1 control + DEL
192	} else { # SPACE, or, if Latin1, shorten the name */
193	use charnames();
194	$name = charnames::viacode($ord);
195	$name =~ s/LATIN CAPITAL LETTER //
196	\|\| $name =~ s/LATIN SMALL LETTER (.*)/\L$1/;
197	}
198	printf "/* U+%02X %s */ %s,\n", $ord, $name, $bits[$ord];
199	}
200