This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/unicode_constants.pl: Move comments around
[perl5.git] / regen / unicode_constants.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
ad88cddb 5require 'regen/charset_translations.pl';
61dad979
KW
6use charnames qw(:loose);
7
1b0f46bf 8my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb 9 {style => '*', by => $0,
61dad979
KW
10 from => "Unicode data"});
11
12print $out_fh <<END;
d10c72f2 13
1b0f46bf
KW
14#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
15#define H_UNICODE_CONSTANTS 1
d10c72f2 16
61dad979 17/* This file contains #defines for various Unicode code points. The values
525b6419
KW
18 * the macros expand to are the native Unicode code point, or all or portions
19 * of the UTF-8 encoding for the code point. In the former case, the macro
20 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979 21 *
525b6419
KW
22 * The macros that have the suffix "_UTF8" may have further suffixes, as
23 * follows:
24 * "_FIRST_BYTE" if the value is just the first byte of the UTF-8
25 * representation; the value will be a numeric constant.
26 * "_TAIL" if instead it represents all but the first byte. This, and
27 * with no additional suffix are both string constants */
61dad979
KW
28
29END
30
9d8e3074 31# The data are at __DATA__ in this file.
61dad979 32
ad88cddb
KW
33my @data = <DATA>;
34
35foreach my $charset (get_supported_code_pages()) {
36 print $out_fh "\n" . get_conditional_compile_line_start($charset);
37
c30a0cf2 38 my @a2n = @{get_a2n($charset)};
ad88cddb 39
4a4b1311
KW
40 for ( @data ) {
41 chomp;
42
43 # Convert any '#' comments to /* ... */; empty lines and comments are
44 # output as blank lines
45 if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
46 my $comment_body = $1 // "";
47 if ($comment_body ne "") {
48 print $out_fh "/* $comment_body */\n";
49 }
50 else {
51 print $out_fh "\n";
52 }
53 next;
5a731a17 54 }
76837d21 55
4a4b1311
KW
56 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
57 (?: [\ ]+ ( [^ ]* ) )? # optional flag
58 (?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
59 /x)
60 {
61 die "Unexpected syntax at line $.: $_\n";
62 }
61dad979 63
4a4b1311
KW
64 my $name_or_cp = $1;
65 my $flag = $2;
66 my $desired_name = $3;
67
68 my $name;
69 my $cp;
70 my $U_cp; # code point in Unicode (not-native) terms
71 my $undef_ok = $desired_name || $flag =~ /skip_if_undef/;
72
73 if ($name_or_cp =~ /^U\+(.*)/) {
74 $U_cp = hex $1;
75 $name = charnames::viacode($name_or_cp);
76 if (! defined $name) {
77 die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok;
78 $name = "";
79 }
80 }
81 else {
82 $name = $name_or_cp;
83 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
84 $U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80 85 }
61dad979 86
4a4b1311
KW
87 $cp = ($U_cp < 256)
88 ? $a2n[$U_cp]
89 : $U_cp;
ad88cddb 90
4a4b1311
KW
91 $name = $desired_name if $name eq "" && $desired_name;
92 $name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979 93
4a4b1311
KW
94 my $str;
95 my $suffix;
96 if (defined $flag && $flag eq 'native') {
97 die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
98 $suffix = '_NATIVE';
99 $str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f
KW
100 }
101 else {
4a4b1311
KW
102 $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
103
104 $suffix = '_UTF8';
105 if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
106 $str = "\"$str\""; # Will be a string constant
107 } elsif ($flag eq 'tail') {
108 $str =~ s/\\x..//; # Remove the first byte
109 $suffix .= '_TAIL';
110 $str = "\"$str\""; # Will be a string constant
111 }
112 elsif ($flag eq 'first') {
113 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
114 $suffix .= '_FIRST_BYTE';
115 $str = "0x$str"; # Is a numeric constant
116 }
117 else {
118 die "Unknown flag at line $.: $_\n";
119 }
81a2a11f 120 }
4a4b1311 121 printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b 122 }
09cc440d
KW
123
124 my $max_PRINT_A = 0;
125 for my $i (0x20 .. 0x7E) {
126 $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
127 }
128 printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
129
ad88cddb 130 print $out_fh "\n" . get_conditional_compile_line_end();
b35552de
KW
131
132}
133
134use Unicode::UCD 'prop_invlist';
135
136my $count = 0;
137my @other_invlist = prop_invlist("Other");
138for (my $i = 0; $i < @other_invlist; $i += 2) {
139 $count += ((defined $other_invlist[$i+1])
140 ? $other_invlist[$i+1]
141 : 0x110000)
142 - $other_invlist[$i];
61dad979 143}
b35552de
KW
144printf $out_fh "\n/* The number of code points not matching \\pC */\n"
145 . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
146 0x110000 - $count;
61dad979 147
1b0f46bf 148print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
d10c72f2 149
61dad979
KW
150read_only_bottom_close_and_rename($out_fh);
151
9d8e3074
KW
152# DATA FORMAT
153#
154# A blank line is output as-is.
155# Comments (lines whose first non-blank is a '#') are converted to C-style,
156# though empty comments are converted to blank lines. Otherwise, each line
157# represents one #define, and begins with either a Unicode character name with
158# the blanks and dashes in it squeezed out or replaced by underscores; or it
159# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
160# case, the name will be looked-up to use as the name of the macro. In either
161# case, the macro name will have suffixes as listed above, and all blanks and
162# dashes will be replaced by underscores.
163#
164# Each line may optionally have one of the following flags on it, separated by
165# white space from the initial token.
166# string indicates that the output is to be of the string form
167# described in the comments above that are placed in the file.
168# string_skip_ifundef is the same as 'string', but instead of dying if the
169# code point doesn't exist, the line is just skipped: no output is
170# generated for it
171# first indicates that the output is to be of the FIRST_BYTE form.
172# tail indicates that the output is of the _TAIL form.
173# native indicates that the output is the code point, converted to the
174# platform's native character set if applicable
175#
176# If the code point has no official name, the desired name may be appended
177# after the flag, which will be ignored if there is an official name.
178#
179# This program is used to make it convenient to create compile time constants
180# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
181# having to figure things out.
182
61dad979 183__DATA__
f2e06375 184U+017F string
76837d21 185
1dfa4f52 186U+0300 string
1dfa4f52 187
a78bc3c6
KW
188U+0399 string
189U+03BC string
190
f2e06375
KW
191U+1E9E string
192
a9f50d33
KW
193U+FB05 string
194U+FB06 string
195
1dfa4f52
KW
196U+2010 string
197U+D800 first FIRST_SURROGATE
5f0aa340
KW
198BOM first
199BOM tail
525b6419 200
df758df2
KW
201NBSP native
202NBSP string
203
05016631 204DEL native
c5eda08a
KW
205CR native
206LF native
d804860b
KW
207VT native
208ESC native
1dfa4f52
KW
209U+00DF native
210U+00E5 native
211U+00C5 native
212U+00FF native
213U+00B5 native