[perl5.git] / regen / unicode_constants.pl

use v5.16.0;
use strict;
use warnings;
require 'regen/regen_lib.pl';
require 'regen/charset_translations.pl';
use charnames qw(:loose);

# Open the generated header for writing.  The third argument tells open_new()
# how to write the "generated file" banner: the comment style to use, the
# program doing the generating ($0), and what it is generated from.
my %banner_info = (
    style => '*',
    by    => $0,
    from  => "Unicode data",
);
my $out_fh = open_new('unicode_constants.h', '>', \%banner_info);

# Emit the include guard and the explanatory C comment that heads the file.
# The text contains nothing to interpolate, so it is quoted literally.
print {$out_fh} <<'END_OF_PREAMBLE';

#ifndef H_UNICODE_CONSTANTS   /* Guard against nested #includes */
#define H_UNICODE_CONSTANTS   1

/* This file contains #defines for various Unicode code points.  The values
 * the macros expand to are the native Unicode code point, or all or portions
 * of the UTF-8 encoding for the code point.  In the former case, the macro
 * name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
 *
 * The macros that have the suffix "_UTF8" may have further suffixes, as
 * follows:
 *  "_FIRST_BYTE" if the value is just the first byte of the UTF-8
 *                representation; the value will be a numeric constant.
 *  "_TAIL"       if instead it represents all but the first byte.  This, and
 *                with no additional suffix are both string constants */

END_OF_PREAMBLE

# The data are at the end of this file.  A blank line is output as-is.
# Comments (lines whose first non-blank is a '#') are converted to C-style,
# though empty comments are converted to blank lines.  Otherwise, each line
# represents one #define, and begins with either a Unicode character name with
# the blanks and dashes in it squeezed out or replaced by underscores; or it
# may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
# case, the name will be looked-up to use as the name of the macro.  In either
# case, the macro name will have suffixes as listed above, and all blanks and
# dashes will be replaced by underscores.
#
# Each line may optionally have one of the following flags on it, separated by
# white space from the initial token.
#   string  indicates that the output is to be of the string form
#           described in the comments above that are placed in the file.
#   string_skip_if_undef  is the same as 'string', but instead of dying if the
#           code point doesn't exist, the line is just skipped: no output is
#           generated for it
#   first   indicates that the output is to be of the FIRST_BYTE form.
#   tail    indicates that the output is of the _TAIL form.
#   native  indicates that the output is the code point, converted to the
#           platform's native character set if applicable
#
# If the code point has no official name, the desired name may be appended
# after the flag, which will be ignored if there is an official name.
#
# This program is used to make it convenient to create compile time constants
# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
# having to figure things out.

# Read the whole __DATA__ section once, stripping the line terminators up
# front; the same lines are then re-processed for every supported character
# set, so the loop below never needs to modify @data.
chomp(my @data = <DATA>);

foreach my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    # Used below to map a Unicode code point below 256 to its native
    # equivalent in $charset.
    my @a2n = @{get_a2n($charset)};

    # Position of the current line within the __DATA__ section (1-based), for
    # use in diagnostics.  $. cannot be used for this: all of DATA was read
    # before this loop started, so $. no longer tracks the line being
    # processed.
    my $line_num = 0;

    LINE:
    for my $line (@data) {
        $line_num++;

        # Convert any '#' comments to /* ... */; empty lines and comments are
        # output as blank lines
        if ($line =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
            my $comment_body = $1 // "";
            if ($comment_body ne "") {
                print $out_fh "/* $comment_body */\n";
            }
            else {
                print $out_fh "\n";
            }
            next LINE;
        }

        unless ($line =~ m/ ^ ( [^\ ]* )           # Name or code point token
                            (?: [\ ]+ ( [^ ]* ) )?  # optional flag
                            (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
                        /x)
        {
            die "Unexpected syntax at line $line_num: $line\n";
        }

        my $name_or_cp = $1;
        my $flag = $2;              # undef when the line has no flag
        my $desired_name = $3;      # undef when no fallback name was given

        # True when the line asks to be silently dropped if its code point is
        # unassigned.  $flag may legitimately be undef, so guard the match.
        my $skip_if_undef = defined $flag && $flag =~ /skip_if_undef/;

        my $name;
        my $cp;
        my $U_cp;   # code point in Unicode (not-native) terms

        if ($name_or_cp =~ /^U\+(.*)/) {
            $U_cp = hex $1;
            $name = charnames::viacode($name_or_cp);
            if (! defined $name) {

                # No official name.  Honor the documented contract of the
                # '..._skip_if_undef' flags: generate no output at all.
                next LINE if $skip_if_undef;

                # Otherwise an explicit fallback name is required
                die "Unknown code point '$name_or_cp' at line $line_num: $line\n"
                                                        unless $desired_name;
                $name = "";
            }
        }
        else {
            $name = $name_or_cp;
            $U_cp = charnames::vianame($name =~ s/_/ /gr);

            # It is the lookup that can fail, not the token itself; without
            # this check an unknown name would continue with an undefined
            # code point.
            die "Unknown name '$name' at line $line_num: $line\n"
                                                        unless defined $U_cp;
        }

        $cp = ($U_cp < 256)
            ? $a2n[$U_cp]
            : $U_cp;

        $name = $desired_name if $name eq "" && $desired_name;
        $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes

        my $str;
        my $suffix;
        if (defined $flag && $flag eq 'native') {
            die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
            $suffix = '_NATIVE';
            $str = sprintf "0x%02X", $cp;        # Is a numeric constant
        }
        else {

            # Render each byte of the UTF-8 encoding as a \xHH escape
            $str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);

            $suffix = '_UTF8';
            if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'tail') {
                $str =~ s/\\x..//;  # Remove the first byte
                $suffix .= '_TAIL';
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'first') {
                $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
                $suffix .= '_FIRST_BYTE';
                $str = "0x$str";        # Is a numeric constant
            }
            else {
                die "Unknown flag at line $line_num: $line\n";
            }
        }
        printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
    }

    # Find the highest native code point among the translations of U+0020
    # through U+007E
    my $max_PRINT_A = 0;
    for my $i (0x20 .. 0x7E) {
        $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
    }
    printf $out_fh "#   define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C   0x%02X   /* The max code point that isPRINT_A */\n", $max_PRINT_A;

    print $out_fh "\n" . get_conditional_compile_line_end();

}

use Unicode::UCD 'prop_invlist';

# Count the code points in the "Other" property (\pC).  The list is walked
# two elements at a time: the element at each even index starts a range and
# the element after it ends that range (exclusive).  A final range with no
# end element runs up to 0x110000.
my @other_invlist = prop_invlist("Other");
my $count = 0;
for my $start_index (grep { $_ % 2 == 0 } 0 .. $#other_invlist) {
    my $range_end = $other_invlist[$start_index + 1] // 0x110000;
    $count += $range_end - $other_invlist[$start_index];
}

# Everything that is not in \pC is what regcomp.c wants the count of
my $non_other_count = 0x110000 - $count;
printf {$out_fh} "\n/* The number of code points not matching \\pC */\n"
               . "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C  %d\n",
               $non_other_count;

# Close the include guard, then let the regen library finish off the file
print {$out_fh} "\n#endif /* H_UNICODE_CONSTANTS */\n";

read_only_bottom_close_and_rename($out_fh);

__DATA__
U+017F string

U+0300 string

U+0399 string
U+03BC string

U+1E9E string

U+FB05 string
U+FB06 string

U+2010 string
U+D800 first FIRST_SURROGATE
BOM first
BOM tail

NBSP native
NBSP string

DEL native
CR  native
LF  native
VT  native
ESC native
U+00DF native
U+00E5 native
U+00C5 native
U+00FF native
U+00B5 native
Commit	Line	Data
61dad979 KW	1	use v5.16.0;
	2	use strict;
	3	use warnings;
	4	require 'regen/regen_lib.pl';
ad88cddb	5	require 'regen/charset_translations.pl';
61dad979 KW	6	use charnames qw(:loose);
61dad979 KW	7
1b0f46bf	8	my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb	9	{style => '*', by => $0,
61dad979 KW	10	from => "Unicode data"});
	11
	12	print $out_fh <<END;
d10c72f2	13
1b0f46bf KW	14	#ifndef H_UNICODE_CONSTANTS /* Guard against nested #includes */
1b0f46bf KW	15	#define H_UNICODE_CONSTANTS 1
d10c72f2	16
61dad979	17	/* This file contains #defines for various Unicode code points. The values
525b6419 KW	18	* the macros expand to are the native Unicode code point, or all or portions
	19	* of the UTF-8 encoding for the code point. In the former case, the macro
	20	* name has the suffix "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979	21	*
525b6419 KW	22	* The macros that have the suffix "_UTF8" may have further suffixes, as
	23	* follows:
	24	* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
	25	* representation; the value will be a numeric constant.
	26	* "_TAIL" if instead it represents all but the first byte. This, and
	27	* with no additional suffix are both string constants */
61dad979 KW	28
	29	END
	30
76837d21	31	# The data are at the end of this file. A blank line is output as-is.
5a731a17 KW	32	# Comments (lines whose first non-blank is a '#') are converted to C-style,
	33	# though empty comments are converted to blank lines. Otherwise, each line
	34	# represents one #define, and begins with either a Unicode character name with
	35	# the blanks and dashes in it squeezed out or replaced by underscores; or it
1dfa4f52	36	# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
76837d21	37	# case, the name will be looked-up to use as the name of the macro. In either
e9cddfae KW	38	# case, the macro name will have suffixes as listed above, and all blanks and
e9cddfae KW	39	# dashes will be replaced by underscores.
61dad979 KW	40	#
	41	# Each line may optionally have one of the following flags on it, separated by
	42	# white space from the initial token.
5f1720e9	43	# string indicates that the output is to be of the string form
61dad979	44	# described in the comments above that are placed in the file.
632c9f80 KW	45	# string_skip_ifundef is the same as 'string', but instead of dying if the
	46	# code point doesn't exist, the line is just skipped: no output is
	47	# generated for it
5f1720e9	48	# first indicates that the output is to be of the FIRST_BYTE form.
61dad979	49	# tail indicates that the output is of the _TAIL form.
525b6419 KW	50	# native indicates that the output is the code point, converted to the
525b6419 KW	51	# platform's native character set if applicable
61dad979	52	#
765ec46c KW	53	# If the code point has no official name, the desired name may be appended
	54	# after the flag, which will be ignored if there is an official name.
	55	#
61dad979 KW	56	# This program is used to make it convenient to create compile time constants
	57	# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
	58	# having to figure things out.
	59
ad88cddb KW	60	my @data = <DATA>;
	61
	62	foreach my $charset (get_supported_code_pages()) {
	63	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	64
c30a0cf2	65	my @a2n = @{get_a2n($charset)};
ad88cddb	66
4a4b1311 KW	67	for ( @data ) {
	68	chomp;
	69
	70	# Convert any '#' comments to /* ... */; empty lines and comments are
	71	# output as blank lines
	72	if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
	73	my $comment_body = $1 // "";
	74	if ($comment_body ne "") {
	75	print $out_fh "/* $comment_body */\n";
	76	}
	77	else {
	78	print $out_fh "\n";
	79	}
	80	next;
5a731a17	81	}
76837d21	82
4a4b1311 KW	83	unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
	84	(?: [\ ]+ ( [^ ]* ) )? # optional flag
	85	(?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
	86	/x)
	87	{
	88	die "Unexpected syntax at line $.: $_\n";
	89	}
61dad979	90
4a4b1311 KW	91	my $name_or_cp = $1;
	92	my $flag = $2;
	93	my $desired_name = $3;
	94
	95	my $name;
	96	my $cp;
	97	my $U_cp; # code point in Unicode (not-native) terms
	98	my $undef_ok = $desired_name \|\| $flag =~ /skip_if_undef/;
	99
	100	if ($name_or_cp =~ /^U\+(.*)/) {
	101	$U_cp = hex $1;
	102	$name = charnames::viacode($name_or_cp);
	103	if (! defined $name) {
	104	die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $undef_ok;
	105	$name = "";
	106	}
	107	}
	108	else {
	109	$name = $name_or_cp;
	110	die "Unknown name '$name' at line $.: $_\n" unless defined $name;
	111	$U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80	112	}
61dad979	113
4a4b1311 KW	114	$cp = ($U_cp < 256)
	115	? $a2n[$U_cp]
	116	: $U_cp;
ad88cddb	117
4a4b1311 KW	118	$name = $desired_name if $name eq "" && $desired_name;
4a4b1311 KW	119	$name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979	120
4a4b1311 KW	121	my $str;
	122	my $suffix;
	123	if (defined $flag && $flag eq 'native') {
	124	die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
	125	$suffix = '_NATIVE';
	126	$str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f KW	127	}
81a2a11f KW	128	else {
4a4b1311 KW	129	$str = join "", map { sprintf "\\x%02X", ord $_ } split //, cp_2_utfbytes($U_cp, $charset);
	130
	131	$suffix = '_UTF8';
	132	if (! defined $flag \|\| $flag =~ /^ string (_skip_if_undef)? $/x) {
	133	$str = "\"$str\""; # Will be a string constant
	134	} elsif ($flag eq 'tail') {
	135	$str =~ s/\\x..//; # Remove the first byte
	136	$suffix .= '_TAIL';
	137	$str = "\"$str\""; # Will be a string constant
	138	}
	139	elsif ($flag eq 'first') {
	140	$str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
	141	$suffix .= '_FIRST_BYTE';
	142	$str = "0x$str"; # Is a numeric constant
	143	}
	144	else {
	145	die "Unknown flag at line $.: $_\n";
	146	}
81a2a11f	147	}
4a4b1311	148	printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b	149	}
09cc440d KW	150
	151	my $max_PRINT_A = 0;
	152	for my $i (0x20 .. 0x7E) {
	153	$max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
	154	}
	155	printf $out_fh "# define MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C 0x%02X /* The max code point that isPRINT_A */\n", $max_PRINT_A;
	156
ad88cddb	157	print $out_fh "\n" . get_conditional_compile_line_end();
b35552de KW	158
	159	}
	160
	161	use Unicode::UCD 'prop_invlist';
	162
	163	my $count = 0;
	164	my @other_invlist = prop_invlist("Other");
	165	for (my $i = 0; $i < @other_invlist; $i += 2) {
	166	$count += ((defined $other_invlist[$i+1])
	167	? $other_invlist[$i+1]
	168	: 0x110000)
	169	- $other_invlist[$i];
61dad979	170	}
b35552de KW	171	printf $out_fh "\n/* The number of code points not matching \\pC */\n"
	172	. "#define NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C %d\n",
	173	0x110000 - $count;
61dad979	174
1b0f46bf	175	print $out_fh "\n#endif /* H_UNICODE_CONSTANTS */\n";
d10c72f2	176
61dad979 KW	177	read_only_bottom_close_and_rename($out_fh);
	178
	179	__DATA__
f2e06375	180	U+017F string
76837d21	181
1dfa4f52	182	U+0300 string
1dfa4f52	183
a78bc3c6 KW	184	U+0399 string
	185	U+03BC string
	186
f2e06375 KW	187	U+1E9E string
f2e06375 KW	188
a9f50d33 KW	189	U+FB05 string
	190	U+FB06 string
	191
1dfa4f52 KW	192	U+2010 string
1dfa4f52 KW	193	U+D800 first FIRST_SURROGATE
5f0aa340 KW	194	BOM first
5f0aa340 KW	195	BOM tail
525b6419	196
df758df2 KW	197	NBSP native
	198	NBSP string
	199
05016631	200	DEL native
c5eda08a KW	201	CR native
c5eda08a KW	202	LF native
d804860b KW	203	VT native
d804860b KW	204	ESC native
1dfa4f52 KW	205	U+00DF native
	206	U+00E5 native
	207	U+00C5 native
	208	U+00FF native
	209	U+00B5 native