[perl5.git] / regen / unicode_constants.pl

use v5.16.0;
use strict;
use warnings;
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';
use Unicode::UCD;
use charnames qw(:loose);

# Open the header file that this script generates.  The hash is passed
# through to open_new() as its third argument.
# NOTE(review): it presumably describes the auto-generated banner placed at
# the top of the file -- confirm against regen/regen_lib.pl.
my %banner_info = (
    style => '*',
    by    => $0,
    from  => "Unicode data",
);
my $out_fh = open_new('unicode_constants.h', '>', \%banner_info);

# Fixed preamble of the generated header: it opens the include guard (the
# matching #endif is printed at the very end of this script) and explains the
# naming scheme of the macros that follow.
print {$out_fh} <<"END_OF_PREAMBLE";

#ifndef H_UNICODE_CONSTANTS   /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS   1

/* This file contains #defines for the version of Unicode being used and
 * various Unicode code points.  The values the code point macros expand to
 * are the native Unicode code point, or all or portions of the UTF-8 encoding
 * for the code point.  In the former case, the macro name has the suffix
 * "_NATIVE"; otherwise, the suffix "_UTF8".
 *
 * The macros that have the suffix "_UTF8" may have further suffixes, as
 * follows:
 *  "_FIRST_BYTE" if the value is just the first byte of the UTF-8
 *                representation; the value will be a numeric constant.
 *  "_TAIL"       if instead it represents all but the first byte.  This, and
 *                with no additional suffix are both string constants */

END_OF_PREAMBLE

# Break the Unicode version string reported by Unicode::UCD into its
# components.  The pattern requires at least one dot; the third component is
# optional and is taken to be 0 when absent.
my $version = Unicode::UCD::UnicodeVersion();
my ($major, $dot, $dotdot)
    = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
$dotdot //= 0;

# One #define per version component
print {$out_fh} <<"END_OF_VERSION";
#define UNICODE_MAJOR_VERSION   $major
#define UNICODE_DOT_VERSION     $dot
#define UNICODE_DOT_DOT_VERSION $dotdot

END_OF_VERSION

# The data are at __DATA__  in this file.  They are slurped once up front
# because the same lines are processed again for every code page.
my @data = <DATA>;
chomp @data;

# Emit one conditionally-compiled group of #defines per supported code page.
foreach my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    # Translation table indexed by code point.  It is consulted below for
    # code points under 256 to obtain the value that 'native' macros expand
    # to.
    my @a2n = @{get_a2n($charset)};

    # Position of the current line within the DATA section, used in
    # diagnostics.  $. cannot serve here: all of DATA has already been read,
    # so it would name the final line no matter which one is at fault.
    my $line_number = 0;

    for my $line ( @data ) {
        $line_number++;

        # Convert any '#' comments to /* ... */; empty lines and comments are
        # output as blank lines
        if ($line =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
            my $comment_body = $1 // "";
            if ($comment_body ne "") {
                print $out_fh "/* $comment_body */\n";
            }
            else {
                print $out_fh "\n";
            }
            next;
        }

        # Split the line into up to three blank-separated fields.  Every
        # piece of the pattern is optional, so the die is only a safeguard.
        unless ($line =~ m/ ^ ( [^\ ]* )           # Name or code point token
                        (?: [\ ]+ ( [^ ]* ) )?  # optional flag
                        (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
                    /x)
        {
            die "Unexpected syntax at line $line_number: $line\n";
        }

        my $name_or_cp = $1;
        my $flag = $2;
        my $desired_name = $3;

        my $name;
        my $cp;
        my $U_cp;   # code point in Unicode (not-native) terms

        if ($name_or_cp =~ /^U\+(.*)/) {
            $U_cp = hex $1;
            $name = charnames::viacode($name_or_cp);
            if (! defined $name) {

                # The flag is optional, so guard against it being absent
                # before matching against it.
                next if defined $flag && $flag =~ /skip_if_undef/;
                die "Unknown code point '$name_or_cp' at line $line_number: $line\n" unless $desired_name;
                $name = "";
            }
        }
        else {
            $name = $name_or_cp;

            # Names are written with underscores in the data; turn them back
            # into blanks for the lookup.  An unknown name yields undef, which
            # must be caught here: otherwise it would be treated as code
            # point 0 below and a bogus #define silently generated.
            $U_cp = charnames::vianame($name =~ s/_/ /gr);
            die "Unknown name '$name' at line $line_number: $line\n" unless defined $U_cp;
        }

        $cp = ($U_cp < 256)
            ? $a2n[$U_cp]
            : $U_cp;

        $name = $desired_name if $name eq "" && $desired_name;
        $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes

        my $str;
        my $suffix;
        if (defined $flag && $flag eq 'native') {
            die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
            $suffix = '_NATIVE';
            $str = sprintf "0x%02X", $cp;        # Is a numeric constant
        }
        else {

            # Spell out each byte returned by cp_2_utfbytes() as a \xNN
            # escape
            $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);

            $suffix = '_UTF8';
            if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'tail') {
                $str =~ s/\\x..//;  # Remove the first byte
                $suffix .= '_TAIL';
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'first') {
                $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
                $suffix .= '_FIRST_BYTE';
                $str = "0x$str";        # Is a numeric constant
            }
            else {
                die "Unknown flag at line $line_number: $line\n";
            }
        }
        printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
    }

    # Find the largest value that the table gives for any of the code points
    # 0x20 through 0x7E
    my $max_PRINT_A = 0;
    for my $i (0x20 .. 0x7E) {
        $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
    }
    printf $out_fh "#   define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C   0x%02X   /* The max code point that isPRINT_A */\n", $max_PRINT_A;

    print $out_fh "\n" . get_conditional_compile_line_end();

}

use Unicode::UCD 'prop_invlist';

# Count the code points that match \p{Other}.  The list from prop_invlist()
# is walked two elements at a time: the first of each pair starts a range of
# matching code points and the second is the first code point past it.  A
# missing final element means the last range runs to the end of the code
# space, 0x110000.
my $count = 0;
my @other_invlist = prop_invlist("Other");
my $idx = 0;
while ($idx < @other_invlist) {
    my $range_end = $other_invlist[$idx + 1] // 0x110000;
    $count += $range_end - $other_invlist[$idx];
    $idx += 2;
}

# What gets output is the complement: everything that is not \p{Other}
printf {$out_fh} "\n/* The number of code points not matching \\pC */\n"
               . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C  %d\n",
               0x110000 - $count;

# If this release has both the CWCM and CWCF properties, output the highest
# code point which changes under any case change, so that code can be
# short-circuited for anything above it.  CWCF is looked up only when CWCM
# gave a non-empty list.
my @cwcm = prop_invlist('CWCM');
my @cwcf = @cwcm ? prop_invlist('CWCF') : ();
if (@cwcm && @cwcf) {

    # Take the larger of the two lists' final elements.  One is subtracted
    # on output; NOTE(review): this presumes each list ends with the first
    # code point past its last range -- confirm against Unicode::UCD.
    my $max = $cwcm[-1];
    $max = $cwcf[-1] if $cwcf[-1] > $max;

    printf {$out_fh} "\n/* The highest code point that has any type of case change */\n"
                   . "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C  0x%X\n",
                   $max - 1;
}

# Close the include guard that the preamble of the generated header opened.
print {$out_fh} "\n#endif /* H_UNICODE_CONSTANTS */\n";

# Hand the finished file over to the regen library.
# NOTE(review): going by its name, this appends the read-only trailer, closes
# the handle and renames the file into place -- confirm in
# regen/regen_lib.pl.
read_only_bottom_close_and_rename($out_fh);

# DATA FORMAT
#
# A blank line is output as-is.
# Comments (lines whose first non-blank is a '#') are converted to C-style,
# though empty comments are converted to blank lines.  Otherwise, each line
# represents one #define, and begins with either a Unicode character name with
# the blanks and dashes in it squeezed out or replaced by underscores; or it
# may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
# case, the name will be looked-up to use as the name of the macro.  In either
# case, the macro name will have suffixes as listed above, and all blanks and
# dashes will be replaced by underscores.
#
# Each line may optionally have one of the following flags on it, separated by
# white space from the initial token.
#   string  indicates that the output is to be of the string form
#           described in the comments above that are placed in the file.
#   string_skip_if_undef  is the same as 'string', but instead of dying if the
#           code point doesn't exist, the line is just skipped: no output is
#           generated for it
#   first   indicates that the output is to be of the FIRST_BYTE form.
#   tail    indicates that the output is of the _TAIL form.
#   native  indicates that the output is the code point, converted to the
#           platform's native character set if applicable
#
# If the code point has no official name, the desired name may be appended
# after the flag, which will be ignored if there is an official name.
#
# This program is used to make it convenient to create compile time constants
# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
# having to figure things out.

__DATA__
U+017F string

U+0300 string

U+0399 string
U+03BC string

U+1E9E string_skip_if_undef

U+FB05 string
U+FB06 string
U+0130 string
U+0131 string

U+2010 string
BOM first
BOM tail

NBSP native
NBSP string

DEL native
CR  native
LF  native
VT  native
ESC native
U+00DF native
U+00E5 native
U+00C5 native
U+00FF native
U+00B5 native
Commit	Line	Data
61dad979 KW	1	use v5.16.0;
	2	use strict;
	3	use warnings;
	4	require 'regen/regen_lib.pl';
ad88cddb	5	require 'regen/charset_translations.pl';
4b4853d1	6	use Unicode::UCD;
61dad979 KW	7	use charnames qw(:loose);
61dad979 KW	8
1b0f46bf	9	my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb	10	{style => '*', by => $0,
61dad979 KW	11	from => "Unicode data"});
	12
	13	print $out_fh <<END;
d10c72f2	14
1b0f46bf KW	15	#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
1b0f46bf KW	16	#define H_UNICODE_CONSTANTS 1
d10c72f2	17
4b4853d1 KW	18	/* This file contains #defines for the version of Unicode being used and
	19	* various Unicode code points. The values the code point macros expand to
	20	* are the native Unicode code point, or all or portions of the UTF-8 encoding
	21	* for the code point. In the former case, the macro name has the suffix
	22	* "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979	23	*
525b6419 KW	24	* The macros that have the suffix "_UTF8" may have further suffixes, as
	25	* follows:
	26	* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
	27	* representation; the value will be a numeric constant.
	28	* "_TAIL" if instead it represents all but the first byte. This, and
	29	* with no additional suffix are both string constants */
61dad979 KW	30
	31	END
	32
4b4853d1 KW	33	my $version = Unicode::UCD::UnicodeVersion();
	34	my ($major, $dot, $dotdot) = $version =~ / (.?) \. (.?) (?: \. (.*) )? $ /x;
	35	$dotdot = 0 unless defined $dotdot;
	36
	37	print $out_fh <<END;
	38	#define UNICODE_MAJOR_VERSION $major
	39	#define UNICODE_DOT_VERSION $dot
	40	#define UNICODE_DOT_DOT_VERSION $dotdot
	41
	42	END
	43
9d8e3074	44	# The data are at __DATA__ in this file.
61dad979	45
ad88cddb KW	46	my @data = <DATA>;
	47
	48	foreach my $charset (get_supported_code_pages()) {
	49	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	50
c30a0cf2	51	my @a2n = @{get_a2n($charset)};
ad88cddb	52
4a4b1311 KW	53	for ( @data ) {
	54	chomp;
	55
	56	# Convert any '#' comments to /* ... */; empty lines and comments are
	57	# output as blank lines
	58	if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
	59	my $comment_body = $1 // "";
	60	if ($comment_body ne "") {
	61	print $out_fh "/* $comment_body */\n";
	62	}
	63	else {
	64	print $out_fh "\n";
	65	}
	66	next;
5a731a17	67	}
76837d21	68
4a4b1311 KW	69	unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
	70	(?: [\ ]+ ( [^ ]* ) )? # optional flag
	71	(?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
	72	/x)
	73	{
	74	die "Unexpected syntax at line $.: $_\n";
	75	}
61dad979	76
4a4b1311 KW	77	my $name_or_cp = $1;
	78	my $flag = $2;
	79	my $desired_name = $3;
	80
	81	my $name;
	82	my $cp;
	83	my $U_cp; # code point in Unicode (not-native) terms
4a4b1311 KW	84
	85	if ($name_or_cp =~ /^U\+(.*)/) {
	86	$U_cp = hex $1;
	87	$name = charnames::viacode($name_or_cp);
	88	if (! defined $name) {
280ac755 KW	89	next if $flag =~ /skip_if_undef/;
280ac755 KW	90	die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
4a4b1311 KW	91	$name = "";
	92	}
	93	}
	94	else {
	95	$name = $name_or_cp;
	96	die "Unknown name '$name' at line $.: $_\n" unless defined $name;
	97	$U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80	98	}
61dad979	99
4a4b1311 KW	100	$cp = ($U_cp < 256)
	101	? $a2n[$U_cp]
	102	: $U_cp;
ad88cddb	103
4a4b1311 KW	104	$name = $desired_name if $name eq "" && $desired_name;
4a4b1311 KW	105	$name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979	106
4a4b1311 KW	107	my $str;
	108	my $suffix;
	109	if (defined $flag && $flag eq 'native') {
	110	die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
	111	$suffix = '_NATIVE';
	112	$str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f KW	113	}
81a2a11f KW	114	else {
4a4b1311 KW	115	$str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
	116
	117	$suffix = '_UTF8';
	118	if (! defined $flag \|\| $flag =~ /^ string (_skip_if_undef)? $/x) {
	119	$str = "\"$str\""; # Will be a string constant
	120	} elsif ($flag eq 'tail') {
	121	$str =~ s/\\x..//; # Remove the first byte
	122	$suffix .= '_TAIL';
	123	$str = "\"$str\""; # Will be a string constant
	124	}
	125	elsif ($flag eq 'first') {
	126	$str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
	127	$suffix .= '_FIRST_BYTE';
	128	$str = "0x$str"; # Is a numeric constant
	129	}
	130	else {
	131	die "Unknown flag at line $.: $_\n";
	132	}
81a2a11f	133	}
4a4b1311	134	printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b	135	}
09cc440d KW	136
	137	my $max_PRINT_A = 0;
	138	for my $i (0x20 .. 0x7E) {
	139	$max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
	140	}
	141	printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
	142
ad88cddb	143	print $out_fh "\n" . get_conditional_compile_line_end();
b35552de KW	144
	145	}
	146
	147	use Unicode::UCD 'prop_invlist';
	148
	149	my $count = 0;
	150	my @other_invlist = prop_invlist("Other");
	151	for (my $i = 0; $i < @other_invlist; $i += 2) {
	152	$count += ((defined $other_invlist[$i+1])
	153	? $other_invlist[$i+1]
	154	: 0x110000)
	155	- $other_invlist[$i];
61dad979	156	}
b35552de KW	157	printf $out_fh "\n/* The number of code points not matching \\pC */\n"
	158	. "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
	159	0x110000 - $count;
61dad979	160
3bfc1e70 KW	161	# If this release has both the CWCM and CWCF properties, find the highest code
	162	# point which changes under any case change. We can use this to short-circuit
	163	# code
	164	my @cwcm = prop_invlist('CWCM');
	165	if (@cwcm) {
	166	my @cwcf = prop_invlist('CWCF');
	167	if (@cwcf) {
	168	my $max = ($cwcm[-1] < $cwcf[-1])
	169	? $cwcf[-1]
	170	: $cwcm[-1];
	171	printf $out_fh "\n/* The highest code point that has any type of case change */\n"
	172	. "#define HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C 0x%X\n",
	173	$max - 1;
	174	}
	175	}
	176
1b0f46bf	177	print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
d10c72f2	178
61dad979 KW	179	read_only_bottom_close_and_rename($out_fh);
61dad979 KW	180
9d8e3074 KW	181	# DATA FORMAT
	182	#
	183	# A blank line is output as-is.
	184	# Comments (lines whose first non-blank is a '#') are converted to C-style,
	185	# though empty comments are converted to blank lines. Otherwise, each line
	186	# represents one #define, and begins with either a Unicode character name with
	187	# the blanks and dashes in it squeezed out or replaced by underscores; or it
	188	# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
	189	# case, the name will be looked-up to use as the name of the macro. In either
	190	# case, the macro name will have suffixes as listed above, and all blanks and
	191	# dashes will be replaced by underscores.
	192	#
	193	# Each line may optionally have one of the following flags on it, separated by
	194	# white space from the initial token.
	195	# string indicates that the output is to be of the string form
	196	# described in the comments above that are placed in the file.
	197	# string_skip_ifundef is the same as 'string', but instead of dying if the
	198	# code point doesn't exist, the line is just skipped: no output is
	199	# generated for it
	200	# first indicates that the output is to be of the FIRST_BYTE form.
	201	# tail indicates that the output is of the _TAIL form.
	202	# native indicates that the output is the code point, converted to the
	203	# platform's native character set if applicable
	204	#
	205	# If the code point has no official name, the desired name may be appended
	206	# after the flag, which will be ignored if there is an official name.
	207	#
	208	# This program is used to make it convenient to create compile time constants
	209	# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
	210	# having to figure things out.
	211
61dad979	212	__DATA__
f2e06375	213	U+017F string
76837d21	214
1dfa4f52	215	U+0300 string
1dfa4f52	216
a78bc3c6 KW	217	U+0399 string
	218	U+03BC string
	219
8f57fa7d	220	U+1E9E string_skip_if_undef
f2e06375	221
a9f50d33 KW	222	U+FB05 string
a9f50d33 KW	223	U+FB06 string
a0ffb25e KW	224	U+0130 string
a0ffb25e KW	225	U+0131 string
a9f50d33	226
1dfa4f52	227	U+2010 string
5f0aa340 KW	228	BOM first
5f0aa340 KW	229	BOM tail
525b6419	230
df758df2 KW	231	NBSP native
	232	NBSP string
	233
05016631	234	DEL native
c5eda08a KW	235	CR native
c5eda08a KW	236	LF native
d804860b KW	237	VT native
d804860b KW	238	ESC native
1dfa4f52 KW	239	U+00DF native
	240	U+00E5 native
	241	U+00C5 native
	242	U+00FF native
	243	U+00B5 native