| 1 | #!perl |
| 2 | # |
| 3 | # This auxiliary script makes five header files |
| 4 | # used for building XSUB of Unicode::Collate. |
| 5 | # |
| 6 | # Usage: |
| 7 | # <do 'mkheader'> in perl, or <perl mkheader> in command line |
| 8 | # |
| 9 | # Input file: |
| 10 | # Collate/allkeys.txt |
| 11 | # |
| 12 | # Output file: |
| 13 | # ucatbl.h |
| 14 | # |
| 15 | use 5.006; |
| 16 | use strict; |
| 17 | use warnings; |
| 18 | use Carp; |
| 19 | use File::Spec; |
| 20 | |
| 21 | BEGIN { |
| 22 | unless ("A" eq pack('U', 0x41)) { |
| 23 | die "Unicode::Collate cannot stringify a Unicode code point\n"; |
| 24 | } |
| 25 | unless (0x41 == unpack('U', 'A')) { |
| 26 | die "Unicode::Collate cannot get a Unicode code point\n"; |
| 27 | } |
| 28 | } |
| 29 | |
| 30 | use constant TRUE => 1; |
| 31 | use constant FALSE => ""; |
| 32 | use constant VCE_TEMPLATE => 'Cn4'; |
| 33 | |
| 34 | sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } |
| 35 | |
| 36 | our $PACKAGE = 'Unicode::Collate, mkheader'; |
| 37 | our $prefix = "UCA_"; |
| 38 | |
| 39 | our %SimpleEntries; # $codepoint => $keys |
| 40 | our @Rest; |
| 41 | |
| 42 | { |
| 43 | my($f, $fh); |
| 44 | foreach my $d (File::Spec->curdir()) { |
| 45 | $f = File::Spec->catfile($d, "Collate", "allkeys.txt"); |
| 46 | last if open($fh, $f); |
| 47 | $f = undef; |
| 48 | } |
| 49 | croak "$PACKAGE: Collate/allkeys.txt is not found" if !defined $f; |
| 50 | |
| 51 | while (my $line = <$fh>) { |
| 52 | next if $line =~ /^\s*#/; |
| 53 | if ($line =~ /^\s*\@/) { |
| 54 | push @Rest, $line; |
| 55 | next; |
| 56 | } |
| 57 | |
| 58 | next if $line !~ /^\s*[0-9A-Fa-f]/; # lines without element |
| 59 | |
| 60 | $line =~ s/[#%]\s*(.*)//; # removing comment (not getting the name) |
| 61 | |
| 62 | # gets element |
| 63 | my($e, $k) = split /;/, $line; |
| 64 | |
| 65 | croak "Wrong Entry: <charList> must be separated by ';' ". |
| 66 | "from <collElement>" if ! $k; |
| 67 | |
| 68 | my @uv = _getHexArray($e); |
| 69 | next if !@uv; |
| 70 | |
| 71 | if (@uv != 1) { |
| 72 | push @Rest, $line; |
| 73 | next; |
| 74 | # Contractions of two or more characters will not be compiled. |
| 75 | } |
| 76 | |
| 77 | my $is_L3_ignorable = TRUE; |
| 78 | |
| 79 | my @key; |
| 80 | foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed |
| 81 | my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. |
| 82 | my @wt = _getHexArray($arr); |
| 83 | push @key, pack(VCE_TEMPLATE, $var, @wt); |
| 84 | $is_L3_ignorable = FALSE |
| 85 | if $wt[0] || $wt[1] || $wt[2]; |
| 86 | # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable |
| 87 | # is completely ignorable. |
| 88 | # For expansion, an entry $is_L3_ignorable |
| 89 | # if and only if "all" CEs are [.0000.0000.0000]. |
| 90 | } |
| 91 | my $mapping = $is_L3_ignorable ? [] : \@key; |
| 92 | my $num = @$mapping; |
| 93 | my $str = chr($num).join('', @$mapping); |
| 94 | $SimpleEntries{$uv[0]} = stringify($str); |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | sub stringify { |
| 99 | my $str = shift; |
| 100 | return sprintf '"%s"', join '', |
| 101 | map sprintf("\\x%02x", ord $_), split //, $str; |
| 102 | |
| 103 | } |
| 104 | |
| 105 | ########## writing header files ########## |
| 106 | |
| 107 | my $init = ''; |
| 108 | { |
| 109 | my $type = "char* const"; |
| 110 | my $head = $prefix."rest"; |
| 111 | |
| 112 | $init .= "static const $type $head [] = {\n"; |
| 113 | for my $line (@Rest) { |
| 114 | $line =~ s/\s*\z//; |
| 115 | next if $line eq ''; |
| 116 | $init .= "/*$line*/\n" if $line =~ /^[A-Za-z0-9_.:;@\ \[\]]+\z/; |
| 117 | $init .= stringify($line).",\n"; |
| 118 | } |
| 119 | $init .= "NULL\n"; # sentinel |
| 120 | $init .= "};\n\n"; |
| 121 | } |
| 122 | |
| 123 | my @tripletable = ( |
| 124 | { |
| 125 | file => "ucatbl", |
| 126 | name => "simple", |
| 127 | type => "char* const", |
| 128 | hash => \%SimpleEntries, |
| 129 | null => "NULL", |
| 130 | init => $init, |
| 131 | }, |
| 132 | ); |
| 133 | |
| 134 | foreach my $tbl (@tripletable) { |
| 135 | my $file = "$tbl->{file}.h"; |
| 136 | my $head = "${prefix}$tbl->{name}"; |
| 137 | my $type = $tbl->{type}; |
| 138 | my $hash = $tbl->{hash}; |
| 139 | my $null = $tbl->{null}; |
| 140 | my $init = $tbl->{init}; |
| 141 | |
| 142 | open my $fh_h, ">$file" or croak "$PACKAGE: $file can't be made"; |
| 143 | binmode $fh_h; select $fh_h; |
| 144 | my %val; |
| 145 | |
| 146 | print << 'EOF'; |
| 147 | /* |
| 148 | * This file is auto-generated by mkheader. |
| 149 | * Any changes here will be lost! |
| 150 | */ |
| 151 | EOF |
| 152 | |
| 153 | print $init if defined $init; |
| 154 | |
| 155 | foreach my $uv (keys %$hash) { |
| 156 | croak sprintf("a Unicode code point 0x%04X over 0x10FFFF.", $uv) |
| 157 | unless $uv <= 0x10FFFF; |
| 158 | my @c = unpack 'CCCC', pack 'N', $uv; |
| 159 | $val{ $c[1] }{ $c[2] }{ $c[3] } = $hash->{$uv}; |
| 160 | # $c[0] must be 0. |
| 161 | } |
| 162 | |
| 163 | foreach my $p (sort { $a <=> $b } keys %val) { |
| 164 | next if ! $val{ $p }; |
| 165 | for (my $r = 0; $r < 256; $r++) { |
| 166 | next if ! $val{ $p }{ $r }; |
| 167 | printf "static const $type ${head}_%02x_%02x [256] = {\n", $p, $r; |
| 168 | for (my $c = 0; $c < 256; $c++) { |
| 169 | print "\t", defined $val{$p}{$r}{$c} |
| 170 | ? $val{$p}{$r}{$c} |
| 171 | : $null; |
| 172 | print ',' if $c != 255; |
| 173 | print "\n" if $c % 8 == 7; |
| 174 | } |
| 175 | print "};\n\n"; |
| 176 | } |
| 177 | } |
| 178 | foreach my $p (sort { $a <=> $b } keys %val) { |
| 179 | next if ! $val{ $p }; |
| 180 | printf "static const $type* const ${head}_%02x [256] = {\n", $p; |
| 181 | for (my $r = 0; $r < 256; $r++) { |
| 182 | print $val{ $p }{ $r } |
| 183 | ? sprintf("${head}_%02x_%02x", $p, $r) |
| 184 | : "NULL"; |
| 185 | print ',' if $r != 255; |
| 186 | print "\n" if $val{ $p }{ $r } || ($r+1) % 8 == 0; |
| 187 | } |
| 188 | print "};\n\n"; |
| 189 | } |
| 190 | print "static const $type* const * const $head [] = {\n"; |
| 191 | for (my $p = 0; $p <= 0x10; $p++) { |
| 192 | print $val{ $p } ? sprintf("${head}_%02x", $p) : "NULL"; |
| 193 | print ',' if $p != 0x10; |
| 194 | print "\n"; |
| 195 | } |
| 196 | print "};\n\n"; |
| 197 | close $fh_h; |
| 198 | } |
| 199 | |
| 200 | 1; |
| 201 | __END__ |