This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/utf8_strings.pl: Copy empty input lines to output
[perl5.git] / regen / utf8_strings.pl
... / ...
CommitLineData
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5use charnames qw(:loose);
6
7my $out_fh = open_new('utf8_strings.h', '>',
8 {style => '*', by => $0,
9 from => "Unicode data"});
10
11print $out_fh <<END;
12
13#ifndef H_UTF8_STRINGS /* Guard against nested #includes */
14#define H_UTF8_STRINGS 1
15
16/* This file contains #defines for various Unicode code points. The values
17 * for the macros are all or portions of the UTF-8 encoding for the code
18 * point. Note that the names all have the suffix "_UTF8".
19 *
20 * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
21 * the first byte of the UTF-8 representation; the value will be a numeric
22 * constant.
23 *
24 * The suffix "_TAIL" is appened if instead it represents all but the first
25 * byte. This, and with no suffix are both string constants */
26
27END
28
29# The data are at the end of this file. A blank line is output as-is.
30# Otherwise, each line represents one #define, and begins with either a
31# Unicode character name with the blanks in it squeezed out or replaced by
32# underscores; or it may be a hexadecimal Unicode code point. In the latter
33# case, the name will be looked-up to use as the name of the macro. In either
34# case, the macro name will have suffixes as listed above, and all blanks will
35# be replaced by underscores.
36#
37# Each line may optionally have one of the following flags on it, separated by
38# white space from the initial token.
39# first indicates that the output is to be of the FIRST_BYTE form
40# described in the comments above that are placed in the file.
41# tail indicates that the output is of the _TAIL form.
42#
43# This program is used to make it convenient to create compile time constants
44# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
45# having to figure things out.
46
47while ( <DATA> ) {
48 if ($_ !~ /\S/) {
49 print $out_fh "\n";
50 next;
51 }
52
53 chomp;
54 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
55 (?: [\ ]+ ( .* ) )? # optional flag
56 /x)
57 {
58 die "Unexpected syntax at line $.: $_\n";
59 }
60
61 my $name_or_cp = $1;
62 my $flag = $2;
63
64 my $name;
65 my $cp;
66
67 if ($name_or_cp =~ /[^[:xdigit:]]/) {
68
69 # Anything that isn't a hex value must be a name.
70 $name = $name_or_cp;
71 $cp = charnames::vianame($name =~ s/_/ /gr);
72 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
73 }
74 else {
75 $cp = $name_or_cp;
76 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
77 # to be sure that the argument is hex
78 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
79 }
80
81 $name =~ s/ /_/g; # The macro name can have no blanks in it
82
83 my $str = join "", map { sprintf "\\x%02X", $_ }
84 unpack("U0C*", pack("U", hex $cp));
85
86 my $suffix = '_UTF8';
87 if (! defined $flag) {
88 $str = "\"$str\""; # Will be a string constant
89 } elsif ($flag eq 'tail') {
90 $str =~ s/\\x..//; # Remove the first byte
91 $suffix .= '_TAIL';
92 $str = "\"$str\""; # Will be a string constant
93 }
94 elsif ($flag eq 'first') {
95 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
96 $suffix .= '_FIRST_BYTE';
97 $str = "0x$str"; # Is a numeric constant
98 }
99 else {
100 die "Unknown flag at line $.: $_\n";
101 }
102 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
103}
104
105print $out_fh "\n#endif /* H_UTF8_STRINGS */\n";
106
107read_only_bottom_close_and_rename($out_fh);
108
109__DATA__
1100300
1110301
1120308
113
11403B9 first
11503B9 tail
116
11703C5 first
11803C5 tail
119
1201100
1211160
12211A8
1232010