Commit | Line | Data |
---|---|---|
a0ed51b3 LW |
1 | #!../../miniperl |
2 | ||
505afebf | 3 | $UnicodeData = "Unicode.300"; |
11695a73 | 4 | |
a0ed51b3 LW |
5 | # Note: we try to keep filenames unique within first 8 chars. Using |
6 | # subdirectories for the following helps. | |
7 | mkdir "In", 0777; | |
8 | mkdir "Is", 0777; | |
9 | mkdir "To", 0777; | |
10 | ||
11 | @todo = ( | |
12 | # typical | |
13 | ||
b8c5462f JH |
14 | ['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''], |
15 | ['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''], | |
16 | ['IsAlpha', '$cat =~ /^L[ulo]/', ''], | |
17 | ['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''], | |
18 | ['IsDigit', '$cat =~ /^Nd$/', ''], | |
19 | ['IsUpper', '$cat =~ /^Lu$/', ''], | |
20 | ['IsLower', '$cat =~ /^Ll$/', ''], | |
21 | ['IsASCII', 'hex $code <= 127', ''], | |
22 | ['IsCntrl', '$cat =~ /^C/', ''], | |
23 | ['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''], | |
24 | ['IsPrint', '$cat =~ /^[^C]/', ''], | |
25 | ['IsPunct', '$cat =~ /^P/', ''], | |
26 | ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], | |
27 | ['ToUpper', '$up', '$up'], | |
28 | ['ToLower', '$down', '$down'], | |
29 | ['ToTitle', '$title', '$title'], | |
30 | ['ToDigit', '$dec ne ""', '$dec'], | |
a0ed51b3 LW |
31 | |
32 | # Name | |
33 | ||
34 | ['Name', '$name', '$name'], | |
35 | ||
36 | # Category | |
37 | ||
38 | ['Category', '$cat', '$cat'], | |
39 | ||
40 | # Normative | |
41 | ||
42 | ['IsM', '$cat =~ /^M/', ''], # Mark | |
43 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing | |
44 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining | |
45 | ||
46 | ['IsN', '$cat =~ /^N/', ''], # Number | |
47 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit | |
48 | ['IsNo', '$cat eq "No"', ''], # Number, Other | |
49 | ||
50 | ['IsZ', '$cat =~ /^Z/', ''], # Zeparator | |
51 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space | |
52 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line | |
53 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph | |
54 | ||
55 | ['IsC', '$cat =~ /^C/', ''], # Crazy | |
56 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format | |
57 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use | |
58 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned | |
59 | ||
60 | # Informative | |
61 | ||
62 | ['IsL', '$cat =~ /^L/', ''], # Letter | |
63 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase | |
64 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase | |
65 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase | |
66 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier | |
67 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other | |
68 | ||
69 | ['IsP', '$cat =~ /^P/', ''], # Punctuation | |
70 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash | |
71 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open | |
72 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close | |
73 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other | |
74 | ||
75 | ['IsS', '$cat =~ /^S/', ''], # Symbol | |
76 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math | |
77 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency | |
78 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other | |
79 | ||
80 | # Combining class | |
81 | ['CombiningClass', '$comb', '$comb'], | |
82 | ||
83 | # BIDIRECTIONAL PROPERTIES | |
84 | ||
85 | ['Bidirectional', '$bid', '$bid'], | |
86 | ||
87 | # Strong types: | |
88 | ||
89 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, | |
90 | # syllabic, and logographic | |
91 | # characters (e.g., CJK | |
92 | # ideographs) | |
93 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, | |
94 | # and punctuation specific to | |
95 | # those scripts | |
96 | ||
97 | # Weak types: | |
98 | ||
99 | ['IsBidiEN','$bid eq "EN"', ''], # European Number | |
100 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator | |
101 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator | |
102 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number | |
103 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator | |
104 | ||
105 | # Separators: | |
106 | ||
107 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator | |
108 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator | |
109 | ||
110 | # Neutrals: | |
111 | ||
112 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace | |
113 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other | |
114 | # characters: punctuation, | |
115 | # symbols | |
116 | ||
117 | # Decomposition | |
118 | ||
119 | ['Decomposition', '$decomp', '$decomp'], | |
120 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], | |
121 | ['IsDecoCompat', '$decomp =~ /^</', ''], | |
122 | ['IsDCfont', '$decomp =~ /^<font>/', ''], | |
123 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], | |
124 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], | |
125 | ['IsDCinital', '$decomp =~ /^<medial>/', ''], | |
126 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], | |
127 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], | |
128 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], | |
129 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], | |
130 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], | |
131 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], | |
132 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], | |
133 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], | |
134 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], | |
135 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], | |
136 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], | |
137 | ||
138 | # Number | |
139 | ||
140 | ['Number', '$num', '$num'], | |
141 | ||
142 | # Mirrored | |
143 | ||
144 | ['IsMirrored', '$mir eq "Y"', ''], | |
145 | ||
146 | # Arabic | |
147 | ||
148 | ['ArabLink', '1', '$link'], | |
149 | ['ArabLnkGrp', '1', '$linkgroup'], | |
150 | ||
151 | # Jamo | |
152 | ||
153 | ['JamoShort', '1', '$short'], | |
499bfa7a JH |
154 | |
155 | # Syllables | |
156 | ||
157 | ['IsSylV', '$syl eq "V"', ''], | |
158 | ['IsSylU', '$syl eq "U"', ''], | |
159 | ['IsSylI', '$syl eq "I"', ''], | |
160 | ['IsSylA', '$syl eq "A"', ''], | |
161 | ['IsSylE', '$syl eq "E"', ''], | |
162 | ['IsSylC', '$syl eq "C"', ''], | |
163 | ['IsSylO', '$syl eq "O"', ''], | |
164 | ['IsSylWV', '$syl eq "V"', ''], | |
165 | ['IsSylWI', '$syl eq "I"', ''], | |
166 | ['IsSylWA', '$syl eq "A"', ''], | |
167 | ['IsSylWE', '$syl eq "E"', ''], | |
168 | ['IsSylWC', '$syl eq "C"', ''], | |
a0ed51b3 LW |
169 | ); |
170 | ||
171 | # This is not written for speed... | |
172 | ||
173 | foreach $file (@todo) { | |
174 | my ($table, $wanted, $val) = @$file; | |
175 | next if @ARGV and not grep { $_ eq $table } @ARGV; | |
176 | print $table,"\n"; | |
177 | if ($table =~ /^(Is|In|To)(.*)/) { | |
178 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; | |
179 | } | |
180 | else { | |
181 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; | |
182 | } | |
14055466 JH |
183 | print OUT <<EOH; |
184 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
185 | # This file is built by $0 from e.g. $UnicodeData. | |
186 | # Any changes made here will be lost! | |
187 | EOH | |
a0ed51b3 LW |
188 | print OUT <<"END"; |
189 | return <<'END'; | |
190 | END | |
191 | print OUT proplist($table, $wanted, $val); | |
192 | print OUT "END\n"; | |
193 | close OUT; | |
194 | } | |
195 | ||
196 | # Must treat blocks specially. | |
197 | ||
198 | exit if @ARGV and not grep { $_ eq Block } @ARGV; | |
199 | print "Block\n"; | |
d357d9fe | 200 | open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n"; |
a0ed51b3 | 201 | open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n"; |
14055466 JH |
202 | print OUT <<EOH; |
203 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
204 | # This file is built by $0 from e.g. $UnicodeData. | |
205 | # Any changes made here will be lost! | |
206 | EOH | |
a0ed51b3 LW |
207 | print OUT <<"END"; |
208 | return <<'END'; | |
209 | END | |
210 | ||
211 | while (<UD>) { | |
212 | next if /^#/; | |
213 | next if /^$/; | |
214 | chomp; | |
215 | ($code, $last, $name) = split(/; */); | |
216 | if ($name) { | |
217 | print OUT "$code $last $name\n"; | |
218 | $name =~ s/\s+//g; | |
219 | open(BLOCK, ">In/$name.pl"); | |
14055466 JH |
220 | print BLOCK <<EOH; |
221 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
222 | # This file is built by $0 from e.g. $UnicodeData. | |
223 | # Any changes made here will be lost! | |
224 | EOH | |
a0ed51b3 LW |
225 | print BLOCK <<"END2"; |
226 | return <<'END'; | |
227 | $code $last | |
228 | END | |
229 | END2 | |
230 | close BLOCK; | |
231 | } | |
232 | } | |
233 | ||
234 | print OUT "END\n"; | |
235 | close OUT; | |
236 | ||
237 | ################################################## | |
238 | ||
239 | sub proplist { | |
240 | my ($table, $wanted, $val) = @_; | |
241 | my @wanted; | |
242 | my $out; | |
243 | my $split; | |
244 | ||
245 | if ($table =~ /^Arab/) { | |
d357d9fe | 246 | open(UD, "ArabShap.txt") or warn "Can't open $table: $!"; |
a0ed51b3 LW |
247 | |
248 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; | |
249 | } | |
250 | elsif ($table =~ /^Jamo/) { | |
505afebf | 251 | open(UD, "Jamo.txt") or warn "Can't open $table: $!"; |
a0ed51b3 LW |
252 | |
253 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; | |
254 | } | |
499bfa7a JH |
255 | elsif ($table =~ /^IsSyl/) { |
256 | open(UD, "syllables.txt") or warn "Can't open $table: $!"; | |
257 | ||
258 | $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;'; | |
259 | } | |
a0ed51b3 | 260 | else { |
11695a73 | 261 | open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!"; |
a0ed51b3 LW |
262 | |
263 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, | |
264 | $comment, $up, $down, $title) = split(/;/);'; | |
265 | } | |
266 | ||
267 | if ($table =~ /^(?:To|Is)[A-Z]/) { | |
268 | eval <<"END"; | |
269 | while (<UD>) { | |
270 | next if /^#/; | |
271 | next if /^\s/; | |
272 | chop; | |
273 | $split | |
274 | if ($wanted) { | |
275 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); | |
276 | } | |
277 | } | |
278 | END | |
279 | die $@ if $@; | |
280 | ||
281 | while (@wanted) { | |
282 | $beg = shift @wanted; | |
283 | $last = $beg; | |
284 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and | |
285 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { | |
286 | $last = shift @wanted; | |
287 | } | |
288 | $out .= sprintf "%04x", $beg->[0]; | |
289 | if ($beg->[2]) { | |
290 | $last = shift @wanted; | |
291 | } | |
292 | if ($beg == $last) { | |
293 | $out .= "\t"; | |
294 | } | |
295 | else { | |
296 | $out .= sprintf "\t%04x", $last->[0]; | |
297 | } | |
298 | $out .= sprintf "\t%04x", $beg->[1] if $val; | |
299 | $out .= "\n"; | |
300 | } | |
301 | } | |
302 | else { | |
303 | eval <<"END"; | |
304 | while (<UD>) { | |
305 | next if /^#/; | |
306 | next if /^\s*\$/; | |
307 | chop; | |
308 | $split | |
309 | if ($wanted) { | |
310 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); | |
311 | } | |
312 | } | |
313 | END | |
314 | die $@ if $@; | |
315 | ||
316 | while (@wanted) { | |
317 | $beg = shift @wanted; | |
318 | $last = $beg; | |
319 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and | |
320 | ($wanted[0]->[1] eq $last->[1])) { | |
321 | $last = shift @wanted; | |
322 | } | |
323 | $out .= sprintf "%04x", $beg->[0]; | |
324 | if ($beg->[2]) { | |
325 | $last = shift @wanted; | |
326 | } | |
327 | if ($beg == $last) { | |
328 | $out .= "\t"; | |
329 | } | |
330 | else { | |
331 | $out .= sprintf "\t%04x", $last->[0]; | |
332 | } | |
333 | $out .= sprintf "\t%s\n", $beg->[1]; | |
334 | } | |
335 | } | |
336 | $out; | |
337 | } | |
11695a73 | 338 | |
6dd159d1 | 339 | # eof |