This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The public_html directory on dromedary is working again.
[perl5.git] / regen / unicode_constants.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
ad88cddb 5require 'regen/charset_translations.pl';
4b4853d1 6use Unicode::UCD;
61dad979
KW
7use charnames qw(:loose);
8
1b0f46bf 9my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb 10 {style => '*', by => $0,
61dad979
KW
11 from => "Unicode data"});
12
13print $out_fh <<END;
d10c72f2 14
1b0f46bf
KW
15#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
16#define H_UNICODE_CONSTANTS 1
d10c72f2 17
4b4853d1
KW
18/* This file contains #defines for the version of Unicode being used and
19 * various Unicode code points. The values the code point macros expand to
20 * are the native Unicode code point, or all or portions of the UTF-8 encoding
21 * for the code point. In the former case, the macro name has the suffix
22 * "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979 23 *
525b6419
KW
24 * The macros that have the suffix "_UTF8" may have further suffixes, as
25 * follows:
26 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
27 * representation; the value will be a numeric constant.
28 * "_TAIL" if instead it represents all but the first byte. This, and
29 * with no additional suffix are both string constants */
61dad979
KW
30
31END
32
4b4853d1
KW
33my $version = Unicode::UCD::UnicodeVersion();
34my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
35$dotdot = 0 unless defined $dotdot;
36
37print $out_fh <<END;
38#define UNICODE_MAJOR_VERSION $major
39#define UNICODE_DOT_VERSION $dot
40#define UNICODE_DOT_DOT_VERSION $dotdot
41
42END
43
9d8e3074 44# The data are at __DATA__ in this file.
61dad979 45
ad88cddb
KW
46my @data = <DATA>;
47
48foreach my $charset (get_supported_code_pages()) {
49 print $out_fh "\n" . get_conditional_compile_line_start($charset);
50
c30a0cf2 51 my @a2n = @{get_a2n($charset)};
ad88cddb 52
4a4b1311
KW
53 for ( @data ) {
54 chomp;
55
56 # Convert any '#' comments to /* ... */; empty lines and comments are
57 # output as blank lines
58 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
59 my $comment_body = $1 // "";
60 if ($comment_body ne "") {
61 print $out_fh "/* $comment_body */\n";
62 }
63 else {
64 print $out_fh "\n";
65 }
66 next;
5a731a17 67 }
76837d21 68
4a4b1311
KW
69 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
70 (?: [\ ]+ ( [^ ]* ) )? # optional flag
71 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
72 /x)
73 {
74 die "Unexpected syntax at line $.: $_\n";
75 }
61dad979 76
4a4b1311
KW
77 my $name_or_cp = $1;
78 my $flag = $2;
79 my $desired_name = $3;
80
81 my $name;
82 my $cp;
83 my $U_cp; # code point in Unicode (not-native) terms
4a4b1311
KW
84
85 if ($name_or_cp =~ /^U\+(.*)/) {
86 $U_cp = hex $1;
87 $name = charnames::viacode($name_or_cp);
88 if (! defined $name) {
280ac755
KW
89 next if $flag =~ /skip_if_undef/;
90 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
4a4b1311
KW
91 $name = "";
92 }
93 }
94 else {
95 $name = $name_or_cp;
96 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
97 $U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80 98 }
61dad979 99
4a4b1311
KW
100 $cp = ($U_cp < 256)
101 ? $a2n[$U_cp]
102 : $U_cp;
ad88cddb 103
4a4b1311
KW
104 $name = $desired_name if $name eq "" && $desired_name;
105 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979 106
4a4b1311
KW
107 my $str;
108 my $suffix;
109 if (defined $flag && $flag eq 'native') {
110 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
111 $suffix = '_NATIVE';
112 $str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f
KW
113 }
114 else {
4a4b1311
KW
115 $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
116
117 $suffix = '_UTF8';
118 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
119 $str = "\"$str\""; # Will be a string constant
120 } elsif ($flag eq 'tail') {
121 $str =~ s/\\x..//; # Remove the first byte
122 $suffix .= '_TAIL';
123 $str = "\"$str\""; # Will be a string constant
124 }
125 elsif ($flag eq 'first') {
126 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
127 $suffix .= '_FIRST_BYTE';
128 $str = "0x$str"; # Is a numeric constant
129 }
130 else {
131 die "Unknown flag at line $.: $_\n";
132 }
81a2a11f 133 }
4a4b1311 134 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b 135 }
09cc440d
KW
136
137 my $max_PRINT_A = 0;
138 for my $i (0x20 .. 0x7E) {
139 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
140 }
141 printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
142
ad88cddb 143 print $out_fh "\n" . get_conditional_compile_line_end();
b35552de
KW
144
145}
146
147use Unicode::UCD 'prop_invlist';
148
149my $count = 0;
150my @other_invlist = prop_invlist("Other");
151for (my $i = 0; $i < @other_invlist; $i += 2) {
152 $count += ((defined $other_invlist[$i+1])
153 ? $other_invlist[$i+1]
154 : 0x110000)
155 - $other_invlist[$i];
61dad979 156}
b35552de
KW
157printf $out_fh "\n/* The number of code points not matching \\pC */\n"
158 . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
159 0x110000 - $count;
61dad979 160
3bfc1e70
KW
161# If this release has both the CWCM and CWCF properties, find the highest code
162# point which changes under any case change. We can use this to short-circuit
163# code
164my @cwcm = prop_invlist('CWCM');
165if (@cwcm) {
166 my @cwcf = prop_invlist('CWCF');
167 if (@cwcf) {
168 my $max = ($cwcm[-1] < $cwcf[-1])
169 ? $cwcf[-1]
170 : $cwcm[-1];
171 printf $out_fh "\n/* The highest code point that has any type of case change */\n"
172 . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n",
173 $max - 1;
174 }
175}
176
1b0f46bf 177print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
d10c72f2 178
61dad979
KW
179read_only_bottom_close_and_rename($out_fh);
180
9d8e3074
KW
181# DATA FORMAT
182#
183# A blank line is output as-is.
184# Comments (lines whose first non-blank is a '#') are converted to C-style,
185# though empty comments are converted to blank lines. Otherwise, each line
186# represents one #define, and begins with either a Unicode character name with
187# the blanks and dashes in it squeezed out or replaced by underscores; or it
188# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
189# case, the name will be looked-up to use as the name of the macro. In either
190# case, the macro name will have suffixes as listed above, and all blanks and
191# dashes will be replaced by underscores.
192#
193# Each line may optionally have one of the following flags on it, separated by
194# white space from the initial token.
195# string indicates that the output is to be of the string form
196# described in the comments above that are placed in the file.
197# string_skip_ifundef is the same as 'string', but instead of dying if the
198# code point doesn't exist, the line is just skipped: no output is
199# generated for it
200# first indicates that the output is to be of the FIRST_BYTE form.
201# tail indicates that the output is of the _TAIL form.
202# native indicates that the output is the code point, converted to the
203# platform's native character set if applicable
204#
205# If the code point has no official name, the desired name may be appended
206# after the flag, which will be ignored if there is an official name.
207#
208# This program is used to make it convenient to create compile time constants
209# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
210# having to figure things out.
211
61dad979 212__DATA__
f2e06375 213U+017F string
76837d21 214
1dfa4f52 215U+0300 string
1dfa4f52 216
a78bc3c6
KW
217U+0399 string
218U+03BC string
219
8f57fa7d 220U+1E9E string_skip_if_undef
f2e06375 221
a9f50d33
KW
222U+FB05 string
223U+FB06 string
a0ffb25e
KW
224U+0130 string
225U+0131 string
a9f50d33 226
1dfa4f52 227U+2010 string
5f0aa340
KW
228BOM first
229BOM tail
525b6419 230
df758df2
KW
231NBSP native
232NBSP string
233
05016631 234DEL native
c5eda08a
KW
235CR native
236LF native
d804860b
KW
237VT native
238ESC native
1dfa4f52
KW
239U+00DF native
240U+00E5 native
241U+00C5 native
242U+00FF native
243U+00B5 native