This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
RMG - Consistent four-space indent; wrap all lines to 79 characters
[perl5.git] / regen / unicode_constants.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
ad88cddb 5require 'regen/charset_translations.pl';
61dad979
KW
6use charnames qw(:loose);
7
1b0f46bf 8my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb 9 {style => '*', by => $0,
61dad979
KW
10 from => "Unicode data"});
11
12print $out_fh <<END;
d10c72f2 13
1b0f46bf
KW
14#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
15#define H_UNICODE_CONSTANTS 1
d10c72f2 16
61dad979 17/* This file contains #defines for various Unicode code points. The values
525b6419
KW
18 * the macros expand to are the native Unicode code point, or all or portions
19 * of the UTF-8 encoding for the code point. In the former case, the macro
20 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979 21 *
525b6419
KW
22 * The macros that have the suffix "_UTF8" may have further suffixes, as
23 * follows:
24 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
25 * representation; the value will be a numeric constant.
26 * "_TAIL" if instead it represents all but the first byte. This, and
27 * with no additional suffix are both string constants */
61dad979
KW
28
29END
30
76837d21 31# The data are at the end of this file. A blank line is output as-is.
5a731a17
KW
32# Comments (lines whose first non-blank is a '#') are converted to C-style,
33# though empty comments are converted to blank lines. Otherwise, each line
34# represents one #define, and begins with either a Unicode character name with
35# the blanks and dashes in it squeezed out or replaced by underscores; or it
1dfa4f52 36# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
76837d21 37# case, the name will be looked-up to use as the name of the macro. In either
e9cddfae
KW
38# case, the macro name will have suffixes as listed above, and all blanks and
39# dashes will be replaced by underscores.
61dad979
KW
40#
41# Each line may optionally have one of the following flags on it, separated by
42# white space from the initial token.
5f1720e9 43# string indicates that the output is to be of the string form
61dad979 44# described in the comments above that are placed in the file.
632c9f80
KW
45# string_skip_ifundef is the same as 'string', but instead of dying if the
46# code point doesn't exist, the line is just skipped: no output is
47# generated for it
5f1720e9 48# first indicates that the output is to be of the FIRST_BYTE form.
61dad979 49# tail indicates that the output is of the _TAIL form.
525b6419
KW
50# native indicates that the output is the code point, converted to the
51# platform's native character set if applicable
61dad979 52#
765ec46c
KW
53# If the code point has no official name, the desired name may be appended
54# after the flag, which will be ignored if there is an official name.
55#
61dad979
KW
56# This program is used to make it convenient to create compile time constants
57# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
58# having to figure things out.
59
ad88cddb
KW
60my @data = <DATA>;
61
62foreach my $charset (get_supported_code_pages()) {
63 print $out_fh "\n" . get_conditional_compile_line_start($charset);
64
c30a0cf2 65 my @a2n = @{get_a2n($charset)};
ad88cddb 66
4a4b1311
KW
67 for ( @data ) {
68 chomp;
69
70 # Convert any '#' comments to /* ... */; empty lines and comments are
71 # output as blank lines
72 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
73 my $comment_body = $1 // "";
74 if ($comment_body ne "") {
75 print $out_fh "/* $comment_body */\n";
76 }
77 else {
78 print $out_fh "\n";
79 }
80 next;
5a731a17 81 }
76837d21 82
4a4b1311
KW
83 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
84 (?: [\ ]+ ( [^ ]* ) )? # optional flag
85 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
86 /x)
87 {
88 die "Unexpected syntax at line $.: $_\n";
89 }
61dad979 90
4a4b1311
KW
91 my $name_or_cp = $1;
92 my $flag = $2;
93 my $desired_name = $3;
94
95 my $name;
96 my $cp;
97 my $U_cp; # code point in Unicode (not-native) terms
98 my $undef_ok = $desired_name || $flag =~ /skip_if_undef/;
99
100 if ($name_or_cp =~ /^U\+(.*)/) {
101 $U_cp = hex $1;
102 $name = charnames::viacode($name_or_cp);
103 if (! defined $name) {
104 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok;
105 $name = "";
106 }
107 }
108 else {
109 $name = $name_or_cp;
110 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
111 $U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80 112 }
61dad979 113
4a4b1311
KW
114 $cp = ($U_cp < 256)
115 ? $a2n[$U_cp]
116 : $U_cp;
ad88cddb 117
4a4b1311
KW
118 $name = $desired_name if $name eq "" && $desired_name;
119 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979 120
4a4b1311
KW
121 my $str;
122 my $suffix;
123 if (defined $flag && $flag eq 'native') {
124 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
125 $suffix = '_NATIVE';
126 $str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f
KW
127 }
128 else {
4a4b1311
KW
129 $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
130
131 $suffix = '_UTF8';
132 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
133 $str = "\"$str\""; # Will be a string constant
134 } elsif ($flag eq 'tail') {
135 $str =~ s/\\x..//; # Remove the first byte
136 $suffix .= '_TAIL';
137 $str = "\"$str\""; # Will be a string constant
138 }
139 elsif ($flag eq 'first') {
140 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
141 $suffix .= '_FIRST_BYTE';
142 $str = "0x$str"; # Is a numeric constant
143 }
144 else {
145 die "Unknown flag at line $.: $_\n";
146 }
81a2a11f 147 }
4a4b1311 148 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b 149 }
09cc440d
KW
150
151 my $max_PRINT_A = 0;
152 for my $i (0x20 .. 0x7E) {
153 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
154 }
155 printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
156
ad88cddb 157 print $out_fh "\n" . get_conditional_compile_line_end();
b35552de
KW
158
159}
160
161use Unicode::UCD 'prop_invlist';
162
163my $count = 0;
164my @other_invlist = prop_invlist("Other");
165for (my $i = 0; $i < @other_invlist; $i += 2) {
166 $count += ((defined $other_invlist[$i+1])
167 ? $other_invlist[$i+1]
168 : 0x110000)
169 - $other_invlist[$i];
61dad979 170}
b35552de
KW
171printf $out_fh "\n/* The number of code points not matching \\pC */\n"
172 . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
173 0x110000 - $count;
61dad979 174
1b0f46bf 175print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
d10c72f2 176
61dad979
KW
177read_only_bottom_close_and_rename($out_fh);
178
179__DATA__
f2e06375 180U+017F string
76837d21 181
1dfa4f52 182U+0300 string
1dfa4f52 183
a78bc3c6
KW
184U+0399 string
185U+03BC string
186
f2e06375
KW
187U+1E9E string
188
a9f50d33
KW
189U+FB05 string
190U+FB06 string
191
1dfa4f52
KW
192U+2010 string
193U+D800 first FIRST_SURROGATE
5f0aa340
KW
194BOM first
195BOM tail
525b6419 196
df758df2
KW
197NBSP native
198NBSP string
199
05016631 200DEL native
c5eda08a
KW
201CR native
202LF native
d804860b
KW
203VT native
204ESC native
1dfa4f52
KW
205U+00DF native
206U+00E5 native
207U+00C5 native
208U+00FF native
209U+00B5 native