[perl5.git] / regen / unicode_constants.pl

use v5.16.0;
use strict;
use warnings;
no warnings 'experimental::regex_sets';
require './regen/regen_lib.pl';
require './regen/charset_translations.pl';
use Unicode::UCD qw(prop_invlist prop_invmap);
use charnames qw(:loose);
binmode(STDERR, ":utf8");

# Debugging switch.  When it is temporarily changed to 1, the complete list of
# paired string delimiters that this program generates is written to STDERR,
# laid out so that it can be pasted into a pod.
my $output_lists = 0;

# Open the header that is to be generated.  open_new() is not defined in this
# file; presumably it comes from one of the regen helper files required above.
my $out_fh = open_new(
    'unicode_constants.h',
    '>',
    { style => '*', by => $0, from => "Unicode data" },
);

# Fixed preamble of the header.  The here-doc interpolates, which is why the
# '$' of the apidoc section name is backslashed.
print {$out_fh} <<"END";

#ifndef PERL_UNICODE_CONSTANTS_H_   /* Guard against nested #includes */
#define PERL_UNICODE_CONSTANTS_H_   1

/* This file contains #defines for the version of Unicode being used and
 * various Unicode code points.  The values the code point macros expand to
 * are the native Unicode code point, or all or portions of the UTF-8 encoding
 * for the code point.  In the former case, the macro name has the suffix
 * "_NATIVE"; otherwise, the suffix "_UTF8".
 *
 * The macros that have the suffix "_UTF8" may have further suffixes, as
 * follows:
 *  "_FIRST_BYTE" if the value is just the first byte of the UTF-8
 *                representation; the value will be a numeric constant.
 *  "_TAIL"       if instead it represents all but the first byte.  This, and
 *                with no additional suffix are both string constants */

/*
=for apidoc_section \$unicode

=for apidoc AmnU|const char *|BOM_UTF8

This is a macro that evaluates to a string constant of the  UTF-8 bytes that
define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
is compiled on.  This allows code to use a mnemonic for this character that
works on both ASCII and EBCDIC platforms.
S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
bytes.

=for apidoc AmnU|const char *|REPLACEMENT_CHARACTER_UTF8

This is a macro that evaluates to a string constant of the  UTF-8 bytes that
define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
is compiled on.  This allows code to use a mnemonic for this character that
works on both ASCII and EBCDIC platforms.
S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
bytes.

=cut
*/

END

sub backslash_x_form($$;$) {
    # Return the text of one or more two-digit "\xHH" escapes describing a
    # code point, for embedding in a C string or character constant.
    #
    # Parameters:
    #   $bytes     The code point to render.  Despite its name, it is used as
    #              a number: it is compared against 255, used as an array
    #              index, and handed to cp_2_utfbytes().
    #   $charset   The character set the output is translated into.
    #   $non_utf8  Optional.  When true, the single code point that
    #              get_a2n($charset) maps $bytes to is rendered.  When absent
    #              or false, each character of the string returned by
    #              cp_2_utfbytes() is rendered (presumably the UTF-8 encoding
    #              of the code point in $charset -- confirm against
    #              charset_translations.pl).
    #
    # Dies if $non_utf8 is true and $bytes is above 255.

    my ($bytes, $charset, $non_utf8) = @_;

    if (! $non_utf8) {
        my $encoded = cp_2_utfbytes($bytes, $charset);
        my @escapes = map { sprintf "\\x%02X", ord $_ } split //, $encoded;
        return join "", @escapes;
    }

    die "Must be utf8 if above 255" if $bytes > 255;

    my $translated = get_a2n($charset)->[$bytes];
    return sprintf "\\x%02X", $translated;
}


# Each directional word that is looked for in a character name, mapped to its
# opposite.
my %opposite_of = ( LEFT => 'RIGHT', RIGHT =>'LEFT' );

# Matches one of the directional words above as a whole word.
my $directional_re = qr/\b(LEFT|RIGHT)\b/;    # Make sure to capture $1

sub format_pairs_line($$) {
    # Format a line describing a character, or a pair of characters, in
    # preparation for output, suitable for pod.
    #
    # Parameters:
    #   $from  code point of the first character
    #   $to    code point of the character paired with it, or undef to
    #          describe $from alone
    #
    # Returns the formatted text, terminated by a newline.  The line holds the
    # character(s) themselves, their U+ code points, and their names starting
    # at column $hanging_indent.  If the names don't fit in $max_len columns,
    # the text continues on further lines indented to the same column, so the
    # returned string may contain embedded newlines.

    my ($from, $to) = @_;

    my $hanging_indent = 26;
    my $max_len = 74;   # Pod formatter will indent 4 spaces

    my $name = charnames::viacode($from);
    my $lhs_hex = sprintf "%04X", $from;
    my $rhs_hex;

    if (defined $to) {
        my $rhs_name = charnames::viacode($to);
        $rhs_hex = sprintf "%04X", $to;

        # Most of the names differ only in LEFT vs RIGHT.  It takes less
        # space, and is easier to understand if they are displayed combined,
        # as in "LEFT/RIGHT PARENTHESIS".  (Only LEFT and RIGHT are handled;
        # pairs differing in other words, such as LESS-THAN vs GREATER-THAN,
        # take the 'else' branch.)
        if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) {
            $name =~ s,$directional_re,$1/$opposite_of{$1},g;
        }
        else {  # Otherwise, display them sequentially
            $name .= ",  " . $rhs_name;
        }
    }

    # Handle double-width characters, based on the East Asian Width property.
    # Add an extra space to non-wide ones so things stay vertically aligned.
    # $extra counts the columns that wide characters occupy beyond what
    # length() reports.
    my $extra = 0;
    my $output_line = " "   # Indent in case output being used for verbatim
                            # pod
                    . chr $from;
    if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) {
        $extra++;
    }
    else {
        $output_line .= " ";
    }

    if (defined $to) {
        $output_line .= " " . chr $to;
        if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) {
            $extra++;
        }
        else {
            $output_line .= " ";
        }
    }
    else {
        $output_line .= "   ";
    }

    $output_line .= "   U+$lhs_hex";
    $output_line .= ", U+$rhs_hex" if defined $to;

    # Pad out to the column where the names start.  The guard avoids a
    # "Negative repeat count" warning should the prefix ever be wider than the
    # hanging indent.
    my $padding = $hanging_indent - ($extra + length $output_line);
    $output_line .= " " x $padding if $padding > 0;

    # Track the displayed width, not the character count, so that lines
    # containing double-width characters are wrapped at the same column as
    # the others.
    my $cur_len = $extra + length $output_line;

    if ($cur_len + length($name) <= $max_len) {
        $output_line .= $name;  # It will fit
    }
    else {  # It won't fit.  Append a segment that is unbreakable until would
            # exceed the available width; then start on a new line
            # Doesn't handle the case where the whole segment doesn't fit;
            # this just doesn't come up with the input data.
        while ($name =~ / ( .+? ) \b{lb} /xg) {
            my $segment = $1;
            my $added_length = length $segment;
            if ($cur_len + $added_length > $max_len) {
                $output_line =~ s/ +$//;
                $output_line .= "\n" . " " x $hanging_indent;
                $cur_len = $hanging_indent;
            }

            $output_line .= $segment;
            $cur_len += $added_length;
        }
    }

    return $output_line . "\n";
}

# Split the Unicode version reported by Unicode::UCD into its dot-separated
# components.  The third component is optional in the pattern; when it is
# absent, 0 is used.
my $version = Unicode::UCD::UnicodeVersion();
my ($major, $dot, $dotdot) = $version =~ / (.*?) \. (.*?) (?: \. (.*) )? $ /x;
$dotdot //= 0;

print {$out_fh} <<"END";
#define UNICODE_MAJOR_VERSION   $major
#define UNICODE_DOT_VERSION     $dot
#define UNICODE_DOT_DOT_VERSION $dotdot

END

# Build the table of characters with left/right symmetry that are suitable as
# paired string delimiters.  Each key is the code point of a character that
# may open a string; its value is the code point that closes it.  '<' and '>'
# are math symbols, which normally aren't used, but this pair is
# traditionally included.
my %paireds = ( ord('<') => ord('>') );

# Bidi_Mirroring_Glyph is the universe of all characters in Unicode which are
# of some import to the Bidirectional Algorithm, and for which there is
# another Unicode character that is a mirror of it.
my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) =
                                            prop_invmap("Bidi_Mirroring_Glyph");

# Perl's current notion of paired opening/closing delimiters is quite
# conservative: only those members of the above property that other Unicode
# properties classify as opening/closing are taken.  Walk the entries in
# parallel: element $i of the list is a code point, and element $i of the map
# is what it mirrors to.
for my $i (0 .. $#{$bmg_invlist}) {
    my $mirror_code_point = $bmg_invmap->[$i];

    # An entry with the default value doesn't map to a character.
    next if $mirror_code_point eq $bmg_default;

    my $code_point = $bmg_invlist->[$i];
    my $char = chr $code_point;

    # Bidi_Paired_Bracket_Type=Open and General_Category=Open_Punctuation
    # characters are definitely openers.  Whether a member of
    # General_Category=Initial_Punctuation opens or closes depends on the
    # language, so it is accepted as an opener here ...
    $paireds{$code_point} = $mirror_code_point
        if $char =~ /(?[ \p{BPT=Open}
                               | \p{Gc=Open_Punctuation}
                               | \p{Gc=Initial_Punctuation}
                            ])/;

    # ... and its mirror is accepted as an opener too, so that either of the
    # two may be at the front.
    $paireds{$mirror_code_point} = $code_point
        if $char =~ /\p{Gc=Initial_Punctuation}/;
}

# There are several hundred other characters that clearly should be
# mirrors of each other, like LEFTWARDS ARROW and RIGHTWARDS ARROW.  Unicode
# did not bother to classify them as mirrors mostly because they aren't of
# import in the Bidirectional Algorithm.  Most of them are symbols.  These
# are not considered opening/closing by Perl for now.

# The rest of the data are at __DATA__  in this file.

# Read the whole __DATA__ section once; the same lines are processed anew for
# each supported character set below.
my @data = <DATA>;

# For every supported character set, output one conditionally-compiled
# section containing:
#   1) a #define for each line of __DATA__;
#   2) the strings of paired opening/closing delimiters gathered in %paireds;
#   3) MAX_PRINT_A.
foreach my $charset (get_supported_code_pages()) {
    print $out_fh "\n" . get_conditional_compile_line_start($charset);

    # Used below to translate a Unicode code point under 256 into the
    # corresponding native code point of $charset.
    my @a2n = @{get_a2n($charset)};

    # Position of the current line within __DATA__, for error messages.  $.
    # can't be used for this: DATA was read to its end above, so $. would
    # name the final line no matter which line is at fault.
    my $line_number = 0;

    for ( @data ) {
        $line_number++;
        chomp;  # (A no-op after the first character set, as $_ aliases the
                # element of @data.)

        # Convert any '#' comments to /* ... */; empty lines and comments are
        # output as blank lines
        if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
            my $comment_body = $1 // "";
            if ($comment_body ne "") {
                print $out_fh "/* $comment_body */\n";
            }
            else {
                print $out_fh "\n";
            }
            next;
        }

        unless ($_ =~ m/ ^ ( [^\ ]* )           # Name or code point token
                        (?: [\ ]+ ( [^ ]* ) )?  # optional flag
                        (?: [\ ]+ ( .* ) )?  # name if unnamed; flag is required
                    /x)
        {
            die "Unexpected syntax at __DATA__ line $line_number: $_\n";
        }

        my $name_or_cp = $1;
        my $flag = $2;
        my $desired_name = $3;

        my $name;
        my $cp;     # code point in the native terms of $charset
        my $U_cp;   # code point in Unicode (not-native) terms

        if ($name_or_cp =~ /^U\+(.*)/) {
            $U_cp = hex $1;
            $name = charnames::viacode($name_or_cp);
            if (! defined $name) {

                # The line may say that an unknown code point is to be
                # silently ignored.  $flag is undefined when the line has no
                # flag at all.
                next if defined $flag && $flag =~ /skip_if_undef/;
                die "Unknown code point '$name_or_cp' at __DATA__ line $line_number: $_\n" unless $desired_name;
                $name = "";
            }
        }
        else {
            $name = $name_or_cp;
            $U_cp = charnames::vianame($name =~ s/_/ /gr);

            # It is the result of the lookup that tells whether the name is
            # known; $name itself is always defined here.
            die "Unknown name '$name' at __DATA__ line $line_number: $_\n"
                                                        unless defined $U_cp;
        }

        $cp = ($U_cp < 256)
            ? $a2n[$U_cp]
            : $U_cp;

        $name = $desired_name if $name eq "" && $desired_name;
        $name =~ s/[- ]/_/g;   # The macro name can have no blanks nor dashes

        my $str;
        my $suffix;
        if (defined $flag && $flag eq 'native') {
            die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
            $suffix = '_NATIVE';
            $str = sprintf "0x%02X", $cp;        # Is a numeric constant
        }
        else {
            $str = backslash_x_form($U_cp, $charset);

            $suffix = '_UTF8';
            if (! defined $flag || $flag =~ /^ string (_skip_if_undef)? $/x) {
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'tail') {
                $str =~ s/\\x..//;  # Remove the first byte
                $suffix .= '_TAIL';
                $str = "\"$str\"";  # Will be a string constant
            }
            elsif ($flag eq 'first') {
                $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
                $suffix .= '_FIRST_BYTE';
                $str = "0x$str";        # Is a numeric constant
            }
            else {
                die "Unknown flag at __DATA__ line $line_number: $_\n";
            }
        }
        printf $out_fh "#   define %s%s  %s    /* U+%04X */\n", $name, $suffix, $str, $U_cp;
    }

    # Now output the strings of opening/closing delimiters.  The Unicode
    # values were earlier entered into %paireds.  Each string below is a
    # concatenation of "\xHH" escapes; the opening and closing strings are
    # built in step, so corresponding delimiters are at corresponding
    # positions.
    my $utf8_opening = "";
    my $utf8_closing = "";
    my $non_utf8_opening = "";
    my $non_utf8_closing = "";
    my $deprecated_if_not_mirrored = "";
    my $non_utf8_deprecated_if_not_mirrored = "";

    for my $from (sort { $a <=> $b } keys %paireds) {
        my $to = $paireds{$from};
        my $utf8_from_backslashed = backslash_x_form($from, $charset);
        my $utf8_to_backslashed   = backslash_x_form($to, $charset);
        my $non_utf8_from_backslashed;
        my $non_utf8_to_backslashed;

        $utf8_opening .= $utf8_from_backslashed;
        $utf8_closing .= $utf8_to_backslashed;

        # Only openers below 256 can occur in a non-UTF-8 string.
        if ($from < 256) {
            $non_utf8_from_backslashed =
                                  backslash_x_form($from, $charset, 'not_utf8');
            $non_utf8_to_backslashed =
                                  backslash_x_form($to, $charset, 'not_utf8');

            $non_utf8_opening .= $non_utf8_from_backslashed;
            $non_utf8_closing .= $non_utf8_to_backslashed;
        }

        # Only the ASCII range paired delimiters have traditionally been
        # accepted.  Until the feature is considered standard, the non-ASCII
        # opening ones must be deprecated when the feature isn't in effect, so
        # as to warn about behavior that is planned to change.
        if ($from > 127) {
            $deprecated_if_not_mirrored .= $utf8_from_backslashed;
            $non_utf8_deprecated_if_not_mirrored .=
                                    $non_utf8_from_backslashed if $from < 256;

            # We deprecate using any of these strongly directional characters
            # at either end of the string, in part so we could allow them to
            # be reversed.  The index() keeps a closing one from being added
            # when it is already in the string.
            $deprecated_if_not_mirrored .= $utf8_to_backslashed
                                       if index ($deprecated_if_not_mirrored,
                                                 $utf8_to_backslashed) < 0;
        }

        # The implementing code in toke.c assumes that the byte length of each
        # opening delimiter is the same as its mirrored closing one.  This
        # makes sure of that by checking upon each iteration of the loop: the
        # accumulated strings can only stay the same length if each pair
        # added to them is.
        if (length $utf8_opening != length $utf8_closing) {
            die "Byte length of representation of '"
              .  charnames::viacode($from)
              . "' differs from its mapping '"
              .  charnames::viacode($to)
              .  "'";
        }

        print STDERR format_pairs_line($from, $to) if $output_lists;
    }
    $output_lists = 0;  # Only output in first iteration

    print $out_fh <<~"EOT";

        #   ifdef PERL_IN_TOKE_C
               /* Paired characters for quote-like operators, in UTF-8 */
        #      define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening"
        #      define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing"

               /* And not in UTF-8 */
        #      define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening"
        #      define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing"

               /* And what's deprecated */
        #      define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored"
        #      define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored"
        #   endif
        EOT

    # Find the largest native code point that any of the Unicode code points
    # 0x20 through 0x7E translates to in this character set.
    my $max_PRINT_A = 0;
    for my $i (0x20 .. 0x7E) {
        $max_PRINT_A = $a2n[$i] if $a2n[$i] > $max_PRINT_A;
    }
    $max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;
    print $out_fh <<"EOT";

#   ifdef PERL_IN_REGCOMP_C
#     define MAX_PRINT_A  $max_PRINT_A   /* The max code point that isPRINT_A */
#   endif
EOT

    print $out_fh get_conditional_compile_line_end();

}

# Count the code points that don't match \pC.  The list returned for "Other"
# is walked two elements at a time: the element at each even index is taken
# as the start of a range that matches, and the following element as the
# first code point past that range.  A final range with no following element
# is taken to extend through the last code point, 0x10FFFF.
my @other_invlist = prop_invlist("Other");
my $count = 0;
for my $start_index (grep { $_ % 2 == 0 } 0 .. $#other_invlist) {
    my $range_start = $other_invlist[$start_index];
    my $range_end   = $other_invlist[$start_index + 1] // 0x110000;
    $count += $range_end - $range_start;
}

# So far $count is how many code points DO match; we want the complement.
$count = 0x110000 - $count;

print {$out_fh} <<~"EOT";

    /* The number of code points not matching \\pC */
    #ifdef PERL_IN_REGCOMP_C
    #  define NON_OTHER_COUNT  $count
    #endif
    EOT

# When this Unicode release has both the CWCM (Changes_When_Casemapped) and
# CWCF (Changes_When_Casefolded) properties, output the highest code point
# that changes under any kind of case change, which lets code short-circuit.
# Nothing is output if either list comes back empty.
my @cwcm = prop_invlist('CWCM');
if (@cwcm) {
    if (my @cwcf = prop_invlist('CWCF')) {

        # Take the larger of the two final elements.  One is subtracted on
        # the assumption that the final element of each list is the first
        # code point past the last matching range -- TODO confirm that these
        # lists always have an even number of elements.
        my $max = ($cwcf[-1] > $cwcm[-1])
                  ? $cwcf[-1]
                  : $cwcm[-1];
        $max = sprintf "0x%X", $max - 1;
        print {$out_fh} <<~"EOS";

            /* The highest code point that has any type of case change */
            #ifdef PERL_IN_UTF8_C
            #  define HIGHEST_CASE_CHANGING_CP  $max
            #endif
            EOS
    }
}

# Close the include guard that was opened at the top of the header.
print {$out_fh} "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";

# Finish off the generated file; this helper is not defined in this file.
read_only_bottom_close_and_rename($out_fh);

# DATA FORMAT
#
# Note that any apidoc comments you want in the file need to be added to one
# of the prints above
#
# A blank line is output as-is.
# Comments (lines whose first non-blank is a '#') are converted to C-style,
# though empty comments are converted to blank lines.  Otherwise, each line
# represents one #define, and begins with either a Unicode character name with
# the blanks and dashes in it squeezed out or replaced by underscores; or it
# may be a hexadecimal Unicode code point of the form U+xxxx.  In the latter
# case, the name will be looked-up to use as the name of the macro.  In either
# case, the macro name will have suffixes as listed above, and all blanks and
# dashes will be replaced by underscores.
#
# Each line may optionally have one of the following flags on it, separated by
# white space from the initial token.
#   string  indicates that the output is to be of the string form
#           described in the comments above that are placed in the file.
#   string_skip_if_undef  is the same as 'string', but instead of dying if the
#           code point doesn't exist, the line is just skipped: no output is
#           generated for it
#   first   indicates that the output is to be of the FIRST_BYTE form.
#   tail    indicates that the output is of the _TAIL form.
#   native  indicates that the output is the code point, converted to the
#           platform's native character set if applicable
#
# If the code point has no official name, the desired name may be appended
# after the flag, which will be ignored if there is an official name.
#
# This program is used to make it convenient to create compile time constants
# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
# having to figure things out.

__DATA__
U+017F string

U+0300 string
U+0307 string

U+1E9E string_skip_if_undef

U+FB05 string
U+FB06 string
U+0130 string
U+0131 string

U+2010 string
BOM first
BOM tail

BOM string

U+FFFD string

U+10FFFF string MAX_UNICODE

NBSP native
NBSP string

DEL native
CR  native
LF  native
VT  native
ESC native
U+00DF native
U+00DF string
U+00E5 native
U+00C5 native
U+00FF native
U+00B5 native
U+00B5 string
Commit	Line	Data
61dad979 KW	1	use v5.16.0;
	2	use strict;
	3	use warnings;
c7b32e72	4	no warnings 'experimental::regex_sets';
3d7c117d MB	5	require './regen/regen_lib.pl';
3d7c117d MB	6	require './regen/charset_translations.pl';
c7b32e72	7	use Unicode::UCD qw(prop_invlist prop_invmap);
61dad979	8	use charnames qw(:loose);
dce1e563 KW	9	binmode(STDERR, ":utf8");
	10
	11	# Set this to 1 temporarily to get on stderr the complete list of paired
	12	# string delimiters this generates. This list is suitable for plugging into a
	13	# pod.
	14	my $output_lists = 0;
61dad979	15
1b0f46bf	16	my $out_fh = open_new('unicode_constants.h', '>',
ad88cddb	17	{style => '*', by => $0,
61dad979 KW	18	from => "Unicode data"});
	19
	20	print $out_fh <<END;
d10c72f2	21
6a5bc5ac KW	22	#ifndef PERL_UNICODE_CONSTANTS_H_ /* Guard against nested #includes */
6a5bc5ac KW	23	#define PERL_UNICODE_CONSTANTS_H_ 1
d10c72f2	24
4b4853d1 KW	25	/* This file contains #defines for the version of Unicode being used and
	26	* various Unicode code points. The values the code point macros expand to
	27	* are the native Unicode code point, or all or portions of the UTF-8 encoding
	28	* for the code point. In the former case, the macro name has the suffix
	29	* "_NATIVE"; otherwise, the suffix "_UTF8".
61dad979	30	*
525b6419 KW	31	* The macros that have the suffix "_UTF8" may have further suffixes, as
	32	* follows:
	33	* "_FIRST_BYTE" if the value is just the first byte of the UTF-8
	34	* representation; the value will be a numeric constant.
	35	* "_TAIL" if instead it represents all but the first byte. This, and
	36	* with no additional suffix are both string constants */
61dad979	37
69bc4c1f	38	/*
3f620621	39	=for apidoc_section \$unicode
69bc4c1f	40
78342678	41	=for apidoc AmnU\|const char *\|BOM_UTF8
69bc4c1f KW	42
	43	This is a macro that evaluates to a string constant of the UTF-8 bytes that
	44	define the Unicode BYTE ORDER MARK (U+FEFF) for the platform that perl
	45	is compiled on. This allows code to use a mnemonic for this character that
	46	works on both ASCII and EBCDIC platforms.
	47	S<C<sizeof(BOM_UTF8) - 1>> can be used to get its length in
	48	bytes.
	49
78342678	50	=for apidoc AmnU\|const char *\|REPLACEMENT_CHARACTER_UTF8
69bc4c1f KW	51
	52	This is a macro that evaluates to a string constant of the UTF-8 bytes that
	53	define the Unicode REPLACEMENT CHARACTER (U+FFFD) for the platform that perl
	54	is compiled on. This allows code to use a mnemonic for this character that
	55	works on both ASCII and EBCDIC platforms.
	56	S<C<sizeof(REPLACEMENT_CHARACTER_UTF8) - 1>> can be used to get its length in
	57	bytes.
	58
	59	=cut
	60	*/
	61
61dad979 KW	62	END
61dad979 KW	63
63cd44e4 KW	64	sub backslash_x_form($$;$) {
	65	# Output the code point represented by the byte string $bytes as a
	66	# sequence of \x{} constants. $bytes should be the UTF-8 for the code
	67	# point if the final parameter is absent or empty. Otherwise it should be
	68	# the Latin1 code point itself.
	69	#
	70	# The output is translated into the character set '$charset'.
	71
	72	my ($bytes, $charset, $non_utf8) = @_;
	73	if ($non_utf8) {
	74	die "Must be utf8 if above 255" if $bytes > 255;
	75	my $a2n = get_a2n($charset);
	76	return sprintf "\\x%02X", $a2n->[$bytes];
	77	}
	78	else {
	79	return join "", map { sprintf "\\x%02X", ord $_ }
	80	split //, cp_2_utfbytes($bytes, $charset);
	81	}
	82	}
	83
dce1e563 KW	84
	85	my %opposite_of = ( LEFT => 'RIGHT', RIGHT =>'LEFT' );
	86
	87	my $directional_re = qr/\b(LEFT\|RIGHT)\b/; # Make sure to capture $1
	88
	89	sub format_pairs_line($$) {
	90	my ($from, $to) = @_;
	91
	92	# Format a line containing a character pair in preparation
	93	# for output, suitable for pod.
	94
	95	my $lhs_name = charnames::viacode($from);
	96	my $lhs_hex = sprintf "%04X", $from;
	97	my $rhs_name;
	98	my $rhs_hex;
	99	my $name = $lhs_name;
	100
	101	my $hanging_indent = 26;
	102
	103	if (defined $to) {
	104	my $rhs_name = charnames::viacode($to);
	105	$rhs_hex = sprintf "%04X", $to;
	106
	107	# Most of the names differ only in LEFT vs RIGHT; some in
	108	# LESS-THAN vs GREATER-THAN. It takes less space, and is easier to
	109	# understand if they are displayed combined.
	110	if ($name =~ s/$directional_re/$opposite_of{$1}/gr eq $rhs_name) {
	111	$name =~ s,$directional_re,$1/$opposite_of{$1},g;
	112	}
	113	else { # Otherwise, display them sequentially
	114	$name .= ", " . $rhs_name;
	115	}
	116	}
	117
	118	# Handle double-width characters, based on the East Asian Width property.
	119	# Add an extra space to non-wide ones so things stay vertically aligned.
	120	my $extra = 0;
	121	my $output_line = " " # Indent in case output being used for verbatim
	122	# pod
	123	. chr $from;
	124	if (chr($from) =~ /[\p{EA=W}\p{EA=F}]/) {
	125	$extra++; # The length() will be shorter than the displayed
	126	# width
	127	}
	128	else {
	129	$output_line .= " ";
	130	}
	131	if (defined $to) {
	132	$output_line .= " " . chr $to;
	133	if (chr($to) =~ /[\p{EA=W}\p{EA=F}]/) {
	134	$extra++;
	135	}
	136	else {
	137	$output_line .= " ";
	138	}
	139	}
	140	else {
	141	$output_line .= " ";
	142	}
	143
	144	$output_line .= " U+$lhs_hex";
	145	$output_line .= ", U+$rhs_hex" if defined $to;;
	146	my $cur_len = $extra + length $output_line;
	147	$output_line .= " " x ($hanging_indent - $cur_len);
148
149	my $max_len = 74; # Pod formatter will indent 4 spaces
150	$cur_len = length $output_line;
151
152	if ($cur_len + length $name <= $max_len) {
153	$output_line .= $name; # It will fit
154	}
155	else { # It won't fit. Append a segment that is unbreakable until would
156	# exceed the available width; then start on a new line
157	# Doesn't handle the case where the whole segment doesn't fit;
158	# this just doesn't come up with the input data.
159	while ($name =~ / ( .+? ) \b{lb} /xg) {
160	my $segment = $1;
161	my $added_length = length $segment;
162	if ($cur_len + $added_length > $max_len) {
163	$output_line =~ s/ +$//;
164	$output_line .= "\n" . " " x $hanging_indent;
165	$cur_len = $hanging_indent;
166	}
167
168	$output_line .= $segment;
169	$cur_len += $added_length;
170	}
171	}
172
173	return $output_line . "\n";
174	}
175
4b4853d1 KW	176	my $version = Unicode::UCD::UnicodeVersion();
	177	my ($major, $dot, $dotdot) = $version =~ / (.?) \. (.?) (?: \. (.*) )? $ /x;
	178	$dotdot = 0 unless defined $dotdot;
	179
	180	print $out_fh <<END;
	181	#define UNICODE_MAJOR_VERSION $major
	182	#define UNICODE_DOT_VERSION $dot
	183	#define UNICODE_DOT_DOT_VERSION $dotdot
	184
	185	END
	186
c7b32e72 KW	187	# Gather the characters in Unicode that have left/right symmetry suitable for
	188	# paired string delimiters
	189	my %paireds = ( ord '<' => ord '>' ); # We don't normally use math ones, but
	190	# this is traditionally included
	191
	192	# This property is the universe of all characters in Unicode which
	193	# are of some import to the Bidirectional Algorithm, and for which there is
	194	# another Unicode character that is a mirror of it.
	195	my ($bmg_invlist, $bmg_invmap, $format, $bmg_default) =
	196	prop_invmap("Bidi_Mirroring_Glyph");
	197
	198	# The current list of characters that Perl considers to be paired
	199	# opening/closing delimiters is quite conservative, consisting of those
	200	# from the above property that other Unicode properties classify as
	201	# opening/closing.
	202
	203	# Find the ones in the bmg list that Unicode thinks are opening ones.
	204	for (my $i = 0; $i < $bmg_invlist->@*; $i++) {
	205	my $mirror_code_point = $bmg_invmap->[$i];
	206	next if $mirror_code_point eq $bmg_default; # Doesn't map to a character.
	207
	208	my $code_point = $bmg_invlist->[$i];
	209
	210	# Bidi_Paired_Bracket_Type=Open and General_Category=Open_Punctuation are
	211	# definitely in the list. It is language-dependent whether members of
	212	# General_Category=Initial_Punctuation are considered opening or closing;
835f2666	213	# we allow either to be at the front
c7b32e72 KW	214	if (chr($code_point) =~ /(?[ \p{BPT=Open}
	215	\| \p{Gc=Open_Punctuation}
	216	\| \p{Gc=Initial_Punctuation}
	217	])/)
	218	{
	219	$paireds{$code_point} = $mirror_code_point;
	220	}
835f2666 KW	221
	222	if (chr($code_point) =~ /\p{Gc=Initial_Punctuation}/) {
	223	$paireds{$mirror_code_point} = $code_point;
	224	}
c7b32e72 KW	225	}
	226
	227	# There are several hundred characters other characters that clearly should be
	228	# mirrors of each other, like LEFTWARDS ARROW and RIGHTWARDS ARROW. Unicode
	229	# did not bother to classify them as mirrors mostly because they aren't of
	230	# import in the Bidirectional Algorithm. Most of them are symbols. These
	231	# are not considered opening/closing by Perl for now.
	232
	233	# The rest of the data are at __DATA__ in this file.
61dad979	234
ad88cddb KW	235	my @data = <DATA>;
	236
	237	foreach my $charset (get_supported_code_pages()) {
	238	print $out_fh "\n" . get_conditional_compile_line_start($charset);
	239
c30a0cf2	240	my @a2n = @{get_a2n($charset)};
ad88cddb	241
4a4b1311 KW	242	for ( @data ) {
	243	chomp;
	244
	245	# Convert any '#' comments to /* ... */; empty lines and comments are
	246	# output as blank lines
	247	if ($_ =~ m/ ^ \s* (?: \# ( .* ) )? $ /x) {
	248	my $comment_body = $1 // "";
	249	if ($comment_body ne "") {
	250	print $out_fh "/* $comment_body */\n";
	251	}
	252	else {
	253	print $out_fh "\n";
	254	}
	255	next;
5a731a17	256	}
76837d21	257
4a4b1311 KW	258	unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
	259	(?: [\ ]+ ( [^ ]* ) )? # optional flag
	260	(?: [\ ]+ ( .* ) )? # name if unnamed; flag is required
	261	/x)
	262	{
	263	die "Unexpected syntax at line $.: $_\n";
	264	}
61dad979	265
4a4b1311 KW	266	my $name_or_cp = $1;
	267	my $flag = $2;
	268	my $desired_name = $3;
	269
	270	my $name;
	271	my $cp;
	272	my $U_cp; # code point in Unicode (not-native) terms
4a4b1311 KW	273
	274	if ($name_or_cp =~ /^U\+(.*)/) {
	275	$U_cp = hex $1;
	276	$name = charnames::viacode($name_or_cp);
	277	if (! defined $name) {
280ac755 KW	278	next if $flag =~ /skip_if_undef/;
280ac755 KW	279	die "Unknown code point '$name_or_cp' at line $.: $_\n" unless $desired_name;
4a4b1311 KW	280	$name = "";
	281	}
	282	}
	283	else {
	284	$name = $name_or_cp;
	285	die "Unknown name '$name' at line $.: $_\n" unless defined $name;
	286	$U_cp = charnames::vianame($name =~ s/_/ /gr);
632c9f80	287	}
61dad979	288
4a4b1311 KW	289	$cp = ($U_cp < 256)
	290	? $a2n[$U_cp]
	291	: $U_cp;
ad88cddb	292
4a4b1311 KW	293	$name = $desired_name if $name eq "" && $desired_name;
4a4b1311 KW	294	$name =~ s/[- ]/_/g; # The macro name can have no blanks nor dashes
61dad979	295
4a4b1311 KW	296	my $str;
	297	my $suffix;
	298	if (defined $flag && $flag eq 'native') {
	299	die "Are you sure you want to run this on an above-Latin1 code point?" if $cp > 0xff;
	300	$suffix = '_NATIVE';
	301	$str = sprintf "0x%02X", $cp; # Is a numeric constant
81a2a11f KW	302	}
81a2a11f KW	303	else {
63cd44e4	304	$str = backslash_x_form($U_cp, $charset);
4a4b1311 KW	305
	306	$suffix = '_UTF8';
	307	if (! defined $flag \|\| $flag =~ /^ string (_skip_if_undef)? $/x) {
	308	$str = "\"$str\""; # Will be a string constant
	309	} elsif ($flag eq 'tail') {
	310	$str =~ s/\\x..//; # Remove the first byte
	311	$suffix .= '_TAIL';
	312	$str = "\"$str\""; # Will be a string constant
	313	}
	314	elsif ($flag eq 'first') {
	315	$str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
	316	$suffix .= '_FIRST_BYTE';
	317	$str = "0x$str"; # Is a numeric constant
	318	}
	319	else {
	320	die "Unknown flag at line $.: $_\n";
	321	}
81a2a11f	322	}
4a4b1311	323	printf $out_fh "# define %s%s %s /* U+%04X */\n", $name, $suffix, $str, $U_cp;
a1beba5b	324	}
09cc440d	325
# Now output the strings of opening/closing delimiters.  The Unicode
# values were earlier entered into %paireds, which maps each opening code
# point to its closing one.
#
# Six strings are built up in parallel, one paired delimiter per iteration,
# in ascending numeric order of the opening code point:
#   $utf8_opening, $utf8_closing
#       every pair, as returned by backslash_x_form() without a third
#       argument
#   $non_utf8_opening, $non_utf8_closing
#       only the pairs whose opening code point is below 256, as returned
#       by backslash_x_form() with 'not_utf8'
#   $deprecated_if_not_mirrored, $non_utf8_deprecated_if_not_mirrored
#       the delimiters above 127; see the comments in the loop
# $charset comes from the enclosing scope.
my $utf8_opening = "";
my $utf8_closing = "";
my $non_utf8_opening = "";
my $non_utf8_closing = "";
my $deprecated_if_not_mirrored = "";
my $non_utf8_deprecated_if_not_mirrored = "";

for my $from (sort { $a <=> $b } keys %paireds) {
    my $to = $paireds{$from};
    my $utf8_from_backslashed = backslash_x_form($from, $charset);
    my $utf8_to_backslashed   = backslash_x_form($to, $charset);

    # These two are set only when $from is below 256
    my $non_utf8_from_backslashed;
    my $non_utf8_to_backslashed;

    $utf8_opening .= $utf8_from_backslashed;
    $utf8_closing .= $utf8_to_backslashed;

    if ($from < 256) {
        $non_utf8_from_backslashed =
                            backslash_x_form($from, $charset, 'not_utf8');
        $non_utf8_to_backslashed =
                            backslash_x_form($to, $charset, 'not_utf8');

        $non_utf8_opening .= $non_utf8_from_backslashed;
        $non_utf8_closing .= $non_utf8_to_backslashed;
    }

    # Only the ASCII range paired delimiters have traditionally been
    # accepted.  Until the feature is considered standard, the non-ASCII
    # opening ones must be deprecated when the feature isn't in effect, so
    # as to warn about behavior that is planned to change.
    if ($from > 127) {
        $deprecated_if_not_mirrored .= $utf8_from_backslashed;
        $non_utf8_deprecated_if_not_mirrored .= $non_utf8_from_backslashed
                                                            if $from < 256;

        # We deprecate using any of these strongly directional characters
        # at either end of the string, in part so we could allow them to
        # be reversed.  The closing one is appended only if it isn't
        # already somewhere in the string.
        $deprecated_if_not_mirrored .= $utf8_to_backslashed
                                   if index($deprecated_if_not_mirrored,
                                            $utf8_to_backslashed) < 0;
    }

    # The implementing code in toke.c assumes that the byte length of each
    # opening delimiter is the same as its mirrored closing one.  This
    # makes sure of that by checking upon each iteration of the loop: the
    # accumulated strings were the same length after the previous pair, so
    # any difference now is due to the pair just added.
    if (length $utf8_opening != length $utf8_closing) {

        # viacode() returns undef for a code point without a name; fall
        # back to the U+ notation so the message is still meaningful.
        my $from_name = charnames::viacode($from)
                     // sprintf("U+%04X", $from);
        my $to_name   = charnames::viacode($to)
                     // sprintf("U+%04X", $to);
        die "Byte length of representation of '"
          . $from_name
          . "' differs from its mapping '"
          . $to_name
          . "'";
    }

    print STDERR format_pairs_line($from, $to) if $output_lists;
}
$output_lists = 0;  # Only output in first iteration

print $out_fh <<~"EOT";

    # ifdef PERL_IN_TOKE_C
    /* Paired characters for quote-like operators, in UTF-8 */
    # define EXTRA_OPENING_UTF8_BRACKETS "$utf8_opening"
    # define EXTRA_CLOSING_UTF8_BRACKETS "$utf8_closing"

    /* And not in UTF-8 */
    # define EXTRA_OPENING_NON_UTF8_BRACKETS "$non_utf8_opening"
    # define EXTRA_CLOSING_NON_UTF8_BRACKETS "$non_utf8_closing"

    /* And what's deprecated */
    # define DEPRECATED_OPENING_UTF8_BRACKETS "$deprecated_if_not_mirrored"
    # define DEPRECATED_OPENING_NON_UTF8_BRACKETS "$non_utf8_deprecated_if_not_mirrored"
    # endif
    EOT
	403
# Find the largest value that @a2n yields for the indices 0x20 through 0x7E,
# and emit it as MAX_PRINT_A.  (@a2n is presumably the ASCII-to-native map
# for the character set currently being processed -- verify against where
# it is set up.)
my $max_PRINT_A = 0;
for my $native (@a2n[0x20 .. 0x7E]) {
    next unless $native > $max_PRINT_A;
    $max_PRINT_A = $native;
}

# From here on the variable holds the C hex literal, not the number
$max_PRINT_A = sprintf "0x%02X", $max_PRINT_A;

print {$out_fh} <<~"EOT";

    # ifdef PERL_IN_REGCOMP_C
    # define MAX_PRINT_A $max_PRINT_A /* The max code point that isPRINT_A */
    # endif
    EOT

print {$out_fh} get_conditional_compile_line_end();
b35552de KW	417
	418	}
	419
# Work out how many code points do not match \pC.  Start from the size of
# the whole code space, 0x110000, and take off the size of every range in
# the inversion list for "Other".  The list is consumed two elements at a
# time: the start of a range and the code point just beyond its end.  A
# final range with no end given is taken to run through 0x10FFFF.
my @other_invlist = prop_invlist("Other");
my $count = 0x110000;

my @boundaries = @other_invlist;
while (my ($start, $beyond) = splice @boundaries, 0, 2) {
    $count -= ($beyond // 0x110000) - $start;
}

print {$out_fh} <<~"EOT";

    /* The number of code points not matching \\pC */
    #ifdef PERL_IN_REGCOMP_C
    # define NON_OTHER_COUNT $count
    #endif
    EOT
61dad979	436
# If this release has both the CWCM and CWCF properties, find the highest
# code point which changes under any case change, and emit it as
# HIGHEST_CASE_CHANGING_CP.  We can use this to short-circuit code.  Nothing
# is output if either property's inversion list comes back empty.
my @cwcm = prop_invlist('CWCM');
HIGHEST_CASE_CHANGE: {
    last HIGHEST_CASE_CHANGE unless @cwcm;

    my @cwcf = prop_invlist('CWCF');
    last HIGHEST_CASE_CHANGE unless @cwcf;

    # Take the larger of the two final list elements
    my $max = $cwcm[-1];
    $max = $cwcf[-1] if $max < $cwcf[-1];

    # NOTE(review): the '- 1' presumably is because the final element of
    # each inversion list begins the range that no longer matches; that
    # relies on both lists having an even number of elements -- confirm.
    $max = sprintf "0x%X", $max - 1;

    print {$out_fh} <<~"EOS";

        /* The highest code point that has any type of case change */
        #ifdef PERL_IN_UTF8_C
        # define HIGHEST_CASE_CHANGING_CP $max
        #endif
        EOS
}
	457
# Terminate the PERL_UNICODE_CONSTANTS_H_ include guard that was opened at
# the top of the generated header; then the output handle is passed to
# read_only_bottom_close_and_rename() to finish the file off.
print {$out_fh} "\n#endif /* PERL_UNICODE_CONSTANTS_H_ */\n";

read_only_bottom_close_and_rename($out_fh);
61dad979 KW	461
9d8e3074 KW	462	# DATA FORMAT
9d8e3074 KW	463	#
69bc4c1f KW	464	# Note that any apidoc comments you want in the file need to be added to one
	465	# of the prints above
	466	#
9d8e3074 KW	467	# A blank line is output as-is.
	468	# Comments (lines whose first non-blank is a '#') are converted to C-style,
	469	# though empty comments are converted to blank lines. Otherwise, each line
	470	# represents one #define, and begins with either a Unicode character name with
	471	# the blanks and dashes in it squeezed out or replaced by underscores; or it
	472	# may be a hexadecimal Unicode code point of the form U+xxxx. In the latter
	473	# case, the name will be looked-up to use as the name of the macro. In either
	474	# case, the macro name will have suffixes as listed above, and all blanks and
	475	# dashes will be replaced by underscores.
	476	#
	477	# Each line may optionally have one of the following flags on it, separated by
	478	# white space from the initial token.
	479	# string indicates that the output is to be of the string form
	480	# described in the comments above that are placed in the file.
	481	# string_skip_if_undef is the same as 'string', but instead of dying if the
	482	# code point doesn't exist, the line is just skipped: no output is
	483	# generated for it
	484	# first indicates that the output is to be of the FIRST_BYTE form.
	485	# tail indicates that the output is of the _TAIL form.
	486	# native indicates that the output is the code point, converted to the
	487	# platform's native character set if applicable
	488	#
	489	# If the code point has no official name, the desired name may be appended
	490	# after the flag, which will be ignored if there is an official name.
	491	#
	492	# This program is used to make it convenient to create compile time constants
	493	# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
	494	# having to figure things out.
	495
61dad979	496	__DATA__
f2e06375	497	U+017F string
76837d21	498
1dfa4f52	499	U+0300 string
2a614cdc	500	U+0307 string
a78bc3c6	501
8f57fa7d	502	U+1E9E string_skip_if_undef
f2e06375	503
a9f50d33 KW	504	U+FB05 string
a9f50d33 KW	505	U+FB06 string
a0ffb25e KW	506	U+0130 string
a0ffb25e KW	507	U+0131 string
a9f50d33	508
1dfa4f52	509	U+2010 string
5f0aa340 KW	510	BOM first
5f0aa340 KW	511	BOM tail
525b6419	512
69bc4c1f KW	513	BOM string
	514
	515	U+FFFD string
	516
566efd88 KW	517	U+10FFFF string MAX_UNICODE
566efd88 KW	518
df758df2 KW	519	NBSP native
	520	NBSP string
	521
05016631	522	DEL native
c5eda08a KW	523	CR native
c5eda08a KW	524	LF native
d804860b KW	525	VT native
d804860b KW	526	ESC native
1dfa4f52	527	U+00DF native
69ffc8e3	528	U+00DF string
1dfa4f52 KW	529	U+00E5 native
	530	U+00C5 native
	531	U+00FF native
	532	U+00B5 native
69ffc8e3	533	U+00B5 string