This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
/regcharclass.pl, utf8_strings.pl: Add guard to .h
[perl5.git] / regen / utf8_strings.pl
CommitLineData
61dad979
KW
1use v5.16.0;
2use strict;
3use warnings;
4require 'regen/regen_lib.pl';
5use charnames qw(:loose);
6
7my $out_fh = open_new('utf8_strings.h', '>',
8 {style => '*', by => $0,
9 from => "Unicode data"});
10
11print $out_fh <<END;
d10c72f2
KW
12
13#ifndef H_UTF8_STRINGS /* Guard against nested #includes */
14#define H_UTF8_STRINGS 1
15
61dad979
KW
16/* This file contains #defines for various Unicode code points. The values
17 * for the macros are all or portions of the UTF-8 encoding for the code
18 * point. Note that the names all have the suffix "_UTF8".
19 *
20 * The suffix "_FIRST_BYTE" may be appended to the name if the value is just
21 * the first byte of the UTF-8 representation; the value will be a numeric
22 * constant.
23 *
24 * The suffix "_TAIL" is appened if instead it represents all but the first
25 * byte. This, and with no suffix are both string constants */
26
27END
28
29# The data are at the end of this file. Each line represents one #define.
30# Each line begins with either a Unicode character name with the blanks in it
31# squeezed out or replaced by underscores; or it may be a hexadecimal code
32# point. In the latter case, the name will be looked-up to use as the name
33# of the macro. In either case, the macro name will have suffixes as
34# listed above, and all blanks will be replaced by underscores.
35#
36# Each line may optionally have one of the following flags on it, separated by
37# white space from the initial token.
38# first indicates that the output is to be of the FIRST_BYTE form
39# described in the comments above that are placed in the file.
40# tail indicates that the output is of the _TAIL form.
41#
42# This program is used to make it convenient to create compile time constants
43# of UTF-8, and to generate proper EBCDIC as well as ASCII without manually
44# having to figure things out.
45
46while ( <DATA> ) {
47 chomp;
48 unless ($_ =~ m/ ^ ( [^\ ]* ) # Name or code point token
49 (?: [\ ]+ ( .* ) )? # optional flag
50 /x)
51 {
52 die "Unexpected syntax at line $.: $_\n";
53 }
54
55 my $name_or_cp = $1;
56 my $flag = $2;
57
58 my $name;
59 my $cp;
60
61 if ($name_or_cp =~ /[^[:xdigit:]]/) {
62
63 # Anything that isn't a hex value must be a name.
64 $name = $name_or_cp;
65 $cp = charnames::vianame($name =~ s/_/ /gr);
66 die "Unknown name '$name' at line $.: $_\n" unless defined $name;
67 }
68 else {
69 $cp = $name_or_cp;
70 $name = charnames::viacode("0$cp"); # viacode requires a leading zero
71 # to be sure that the argument is hex
72 die "Unknown code point '$cp' at line $.: $_\n" unless defined $cp;
73 }
74
75 $name =~ s/ /_/g; # The macro name can have no blanks in it
76
77 my $str = join "", map { sprintf "\\x%02X", $_ }
78 unpack("U0C*", pack("U", hex $cp));
79
80 my $suffix = '_UTF8';
81 if (! defined $flag) {
82 $str = "\"$str\""; # Will be a string constant
83 } elsif ($flag eq 'tail') {
84 $str =~ s/\\x..//; # Remove the first byte
85 $suffix .= '_TAIL';
86 $str = "\"$str\""; # Will be a string constant
87 }
88 elsif ($flag eq 'first') {
89 $str =~ s/ \\x ( .. ) .* /$1/x; # Get the two nibbles of the 1st byte
90 $suffix .= '_FIRST_BYTE';
91 $str = "0x$str"; # Is a numeric constant
92 }
93 else {
94 die "Unknown flag at line $.: $_\n";
95 }
96 print $out_fh "#define ${name}$suffix $str /* U+$cp */\n";
97}
98
d10c72f2
KW
99print $out_fh "\n#endif /* H_UTF8_STRINGS */\n";
100
61dad979
KW
101read_only_bottom_close_and_rename($out_fh);
102
103__DATA__
1040300
1050301
1060308
10703B9 tail
10803C5 tail
10903B9 first
11003C5 first
1111100
1121160
11311A8
1142010