This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/utf8_strings.pl: Allow explicit default on input
[perl5.git] / regen / utf8_strings.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5use charnames qw(:loose);
6
7my $out_fh = open_new('utf8_strings.h', '>',
8 {style => '*', by => $0,
9 from => "Unicode data"});
10
11print $out_fh <<END;
d10c72f2
KW
12
13#ifndef H_UTF8_STRINGS /* Guard against nested #includes */
14#define H_UTF8_STRINGS 1
15
61dad979
KW
16/* This file contains #defines for various Unicode code points. The values
17 * for the macros are all or portions of the UTF-8 encoding for the code
18 * point. Note that the names all have the suffix "_UTF8".
19 *
20 * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
21 * the first byte of the UTF-8 representation; the value will be a numeric
22 * constant.
23 *
24 * The suffix "_TAIL" is appened if instead it represents all but the first
25 * byte. This, and with no suffix are both string constants */
26
27END
28
76837d21
KW
29# The data are at the end of this file. A blank line is output as-is.
30# Otherwise, each line represents one #define, and begins with either a
31# Unicode character name with the blanks in it squeezed out or replaced by
32# underscores; or it may be a hexadecimal Unicode code point. In the latter
33# case, the name will be looked-up to use as the name of the macro. In either
34# case, the macro name will have suffixes as listed above, and all blanks will
35# be replaced by underscores.
61dad979
KW
36#
37# Each line may optionally have one of the following flags on it, separated by
38# white space from the initial token.
5f1720e9 39# string indicates that the output is to be of the string form
61dad979 40# described in the comments above that are placed in the file.
5f1720e9 41# first indicates that the output is to be of the FIRST_BYTE form.
61dad979
KW
42# tail indicates that the output is of the _TAIL form.
43#
44# This program is used to make it convenient to create compile time constants
45# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
46# having to figure things out.
47
48while ( <DATA> ) {
76837d21
KW
49 if ($_ !~ /\S/) {
50 print $out_fh "\n";
51 next;
52 }
53
61dad979
KW
54 chomp;
55 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
56 (?: [\ ]+ ( .* ) )? # optional flag
57 /x)
58 {
59 die "Unexpected syntax at line $.: $_\n";
60 }
61
62 my $name_or_cp = $1;
63 my $flag = $2;
64
65 my $name;
66 my $cp;
67
68 if ($name_or_cp =~ /[^[:xdigit:]]/) {
69
70 # Anything that isn't a hex value must be a name.
71 $name = $name_or_cp;
72 $cp = charnames::vianame($name =~ s/_/ /gr);
73 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
74 }
75 else {
76 $cp = $name_or_cp;
77 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
78 # to be sure that the argument is hex
79 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
80 }
81
82 $name =~ s/ /_/g; # The macro name can have no blanks in it
83
84 my $str = join "", map { sprintf "\\x%02X", $_ }
85 unpack("U0C*", pack("U", hex $cp));
86
87 my $suffix = '_UTF8';
5f1720e9 88 if (! defined $flag || $flag eq 'string') {
61dad979
KW
89 $str = "\"$str\""; # Will be a string constant
90 } elsif ($flag eq 'tail') {
91 $str =~ s/\\x..//; # Remove the first byte
92 $suffix .= '_TAIL';
93 $str = "\"$str\""; # Will be a string constant
94 }
95 elsif ($flag eq 'first') {
96 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
97 $suffix .= '_FIRST_BYTE';
98 $str = "0x$str"; # Is a numeric constant
99 }
100 else {
101 die "Unknown flag at line $.: $_\n";
102 }
103 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
104}
105
d10c72f2
KW
106print $out_fh "\n#endif /* H_UTF8_STRINGS */\n";
107
61dad979
KW
108read_only_bottom_close_and_rename($out_fh);
109
110__DATA__
5f1720e9
KW
1110300 string
1120301 string
1130308 string
76837d21 114
61dad979 11503B9 first
76837d21
KW
11603B9 tail
117
61dad979 11803C5 first
76837d21
KW
11903C5 tail
120
61dad979
KW
1211100
1221160
12311A8
5f1720e9 1242010 string