Commit | Line | Data |
---|---|---|
a0ed51b3 LW |
1 | #!../../miniperl |
2 | ||
7c6f5cd2 GS |
3 | use bytes; |
4 | ||
8836d2a5 | 5 | $UnicodeData = "Unicode.301"; |
7c6f5cd2 | 6 | $SyllableData = "syllables.txt"; |
8836d2a5 | 7 | $PropData = "PropList.txt"; |
7c6f5cd2 | 8 | |
11695a73 | 9 | |
a0ed51b3 LW |
10 | # Note: we try to keep filenames unique within first 8 chars. Using |
11 | # subdirectories for the following helps. | |
289d4f09 ML |
12 | mkdir "In", 0755; |
13 | mkdir "Is", 0755; | |
14 | mkdir "To", 0755; | |
a0ed51b3 LW |
15 | |
16 | @todo = ( | |
17 | # typical | |
18 | ||
f59877d4 JH |
19 | # 005F: SPACING UNDERSCROE |
20 | ['IsWord', '$cat =~ /^[LMN]/ or $code eq "005F"', ''], | |
21 | ['IsAlnum', '$cat =~ /^[LMN]/', ''], | |
22 | ['IsAlpha', '$cat =~ /^[LM]/', ''], | |
23 | # 0009: HORIZONTAL TABULATION | |
24 | # 000A: LINE FEED | |
25 | # 000B: VERTICAL TABULATION | |
26 | # 000C: FORM FEED | |
27 | # 000D: CARRIAGE RETURN | |
3bec3564 | 28 | # 0020: SPACE |
f59877d4 JH |
29 | ['IsSpace', '$cat =~ /^Z/ || |
30 | $code =~ /^(0009|000A|000B|000C|000D)$/', ''], | |
3bec3564 JH |
31 | ['IsSpacePerl', |
32 | '$cat =~ /^Z/ || | |
33 | $code =~ /^(0009|000A|000C|000D)$/', ''], | |
34 | ['IsBlank', '$cat =~ /^Z[^lp]$/ || $code eq "0009"', ''], | |
b8c5462f | 35 | ['IsDigit', '$cat =~ /^Nd$/', ''], |
7c6f5cd2 | 36 | ['IsUpper', '$cat =~ /^L[ut]$/', ''], |
b8c5462f | 37 | ['IsLower', '$cat =~ /^Ll$/', ''], |
f59877d4 | 38 | ['IsASCII', '$code le "007f"', ''], |
b8c5462f | 39 | ['IsCntrl', '$cat =~ /^C/', ''], |
f59877d4 JH |
40 | ['IsGraph', '$cat =~ /^([LMNPS]|Co)/', ''], |
41 | ['IsPrint', '$cat =~ /^([LMNPS]|Co|Zs)/', ''], | |
42 | ['IsPunct', '$cat =~ /^P/', ''], | |
43 | # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f | |
b8c5462f JH |
44 | ['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''], |
45 | ['ToUpper', '$up', '$up'], | |
46 | ['ToLower', '$down', '$down'], | |
47 | ['ToTitle', '$title', '$title'], | |
48 | ['ToDigit', '$dec ne ""', '$dec'], | |
a0ed51b3 LW |
49 | |
50 | # Name | |
51 | ||
52 | ['Name', '$name', '$name'], | |
53 | ||
54 | # Category | |
55 | ||
56 | ['Category', '$cat', '$cat'], | |
57 | ||
58 | # Normative | |
59 | ||
60 | ['IsM', '$cat =~ /^M/', ''], # Mark | |
61 | ['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing | |
62 | ['IsMc', '$cat eq "Mc"', ''], # Mark, Combining | |
7c6f5cd2 | 63 | ['IsMe', '$cat eq "Me"', ''], # Mark, Enclosing |
a0ed51b3 LW |
64 | |
65 | ['IsN', '$cat =~ /^N/', ''], # Number | |
66 | ['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit | |
67 | ['IsNo', '$cat eq "No"', ''], # Number, Other | |
7c6f5cd2 | 68 | ['IsNl', '$cat eq "Nl"', ''], # Number, Letter |
a0ed51b3 | 69 | |
7c6f5cd2 | 70 | ['IsZ', '$cat =~ /^Z/', ''], # Separator |
a0ed51b3 LW |
71 | ['IsZs', '$cat eq "Zs"', ''], # Separator, Space |
72 | ['IsZl', '$cat eq "Zl"', ''], # Separator, Line | |
73 | ['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph | |
74 | ||
75 | ['IsC', '$cat =~ /^C/', ''], # Crazy | |
76 | ['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format | |
77 | ['IsCo', '$cat eq "Co"', ''], # Other, Private Use | |
78 | ['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned | |
7c6f5cd2 GS |
79 | ['IsCf', '$cat eq "Cf"', ''], # Other, Format |
80 | ['IsCs', '$cat eq "Cs"', ''], # Other, Surrogate | |
81 | ['IsCn', 'Unassigned Code Value',$PropData], # Other, Not Assigned | |
a0ed51b3 LW |
82 | |
83 | # Informative | |
84 | ||
85 | ['IsL', '$cat =~ /^L/', ''], # Letter | |
86 | ['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase | |
87 | ['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase | |
88 | ['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase | |
89 | ['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier | |
90 | ['IsLo', '$cat eq "Lo"', ''], # Letter, Other | |
91 | ||
92 | ['IsP', '$cat =~ /^P/', ''], # Punctuation | |
93 | ['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash | |
94 | ['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open | |
95 | ['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close | |
96 | ['IsPo', '$cat eq "Po"', ''], # Punctuation, Other | |
7c6f5cd2 GS |
97 | ['IsPc', '$cat eq "Pc"', ''], # Punctuation, Connector |
98 | ['IsPi', '$cat eq "Pi"', ''], # Punctuation, Initial quote | |
99 | ['IsPf', '$cat eq "Pf"', ''], # Punctuation, Final quote | |
a0ed51b3 LW |
100 | |
101 | ['IsS', '$cat =~ /^S/', ''], # Symbol | |
102 | ['IsSm', '$cat eq "Sm"', ''], # Symbol, Math | |
7c6f5cd2 | 103 | ['IsSk', '$cat eq "Sk"', ''], # Symbol, Modifier |
a0ed51b3 LW |
104 | ['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency |
105 | ['IsSo', '$cat eq "So"', ''], # Symbol, Other | |
106 | ||
107 | # Combining class | |
108 | ['CombiningClass', '$comb', '$comb'], | |
109 | ||
110 | # BIDIRECTIONAL PROPERTIES | |
111 | ||
112 | ['Bidirectional', '$bid', '$bid'], | |
113 | ||
114 | # Strong types: | |
115 | ||
116 | ['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic, | |
117 | # syllabic, and logographic | |
118 | # characters (e.g., CJK | |
119 | # ideographs) | |
120 | ['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew, | |
121 | # and punctuation specific to | |
122 | # those scripts | |
123 | ||
7c6f5cd2 GS |
124 | ['IsBidiLRE', '$bid eq "LRE"', ''], # Left-to-Right Embedding |
125 | ['IsBidiLRO', '$bid eq "LRO"', ''], # Left-to-Right Override | |
126 | ['IsBidiAL', '$bid eq "AL"', ''], # Right-to-Left Arabic | |
127 | ['IsBidiRLE', '$bid eq "RLE"', ''], # Right-to-Left Embedding | |
128 | ['IsBidiRLO', '$bid eq "RLO"', ''], # Right-to-Left Override | |
129 | ['IsBidiPDF', '$bid eq "PDF"', ''], # Pop Directional Format | |
130 | ['IsBidiNSM', '$bid eq "NSM"', ''], # Non-Spacing Mark | |
131 | ['IsBidiBN', '$bid eq "BN"', ''], # Boundary Neutral | |
132 | ||
a0ed51b3 LW |
133 | # Weak types: |
134 | ||
135 | ['IsBidiEN','$bid eq "EN"', ''], # European Number | |
136 | ['IsBidiES','$bid eq "ES"', ''], # European Number Separator | |
137 | ['IsBidiET','$bid eq "ET"', ''], # European Number Terminator | |
138 | ['IsBidiAN','$bid eq "AN"', ''], # Arabic Number | |
139 | ['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator | |
140 | ||
141 | # Separators: | |
142 | ||
143 | ['IsBidiB', '$bid eq "B"', ''], # Block Separator | |
144 | ['IsBidiS', '$bid eq "S"', ''], # Segment Separator | |
145 | ||
146 | # Neutrals: | |
147 | ||
148 | ['IsBidiWS','$bid eq "WS"', ''], # Whitespace | |
149 | ['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other | |
150 | # characters: punctuation, | |
151 | # symbols | |
152 | ||
153 | # Decomposition | |
154 | ||
155 | ['Decomposition', '$decomp', '$decomp'], | |
156 | ['IsDecoCanon', '$decomp && $decomp !~ /^</', ''], | |
157 | ['IsDecoCompat', '$decomp =~ /^</', ''], | |
158 | ['IsDCfont', '$decomp =~ /^<font>/', ''], | |
159 | ['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''], | |
160 | ['IsDCinitial', '$decomp =~ /^<initial>/', ''], | |
f59877d4 | 161 | ['IsDCmedial', '$decomp =~ /^<medial>/', ''], |
a0ed51b3 LW |
162 | ['IsDCfinal', '$decomp =~ /^<final>/', ''], |
163 | ['IsDCisolated', '$decomp =~ /^<isolated>/', ''], | |
164 | ['IsDCcircle', '$decomp =~ /^<circle>/', ''], | |
165 | ['IsDCsuper', '$decomp =~ /^<super>/', ''], | |
166 | ['IsDCsub', '$decomp =~ /^<sub>/', ''], | |
167 | ['IsDCvertical', '$decomp =~ /^<vertical>/', ''], | |
168 | ['IsDCwide', '$decomp =~ /^<wide>/', ''], | |
169 | ['IsDCnarrow', '$decomp =~ /^<narrow>/', ''], | |
170 | ['IsDCsmall', '$decomp =~ /^<small>/', ''], | |
171 | ['IsDCsquare', '$decomp =~ /^<square>/', ''], | |
7c6f5cd2 | 172 | ['IsDCfraction', '$decomp =~ /^<fraction>/', ''], |
a0ed51b3 LW |
173 | ['IsDCcompat', '$decomp =~ /^<compat>/', ''], |
174 | ||
175 | # Number | |
176 | ||
e1b504a6 | 177 | ['Number', '$num ne ""', '$num'], |
a0ed51b3 LW |
178 | |
179 | # Mirrored | |
180 | ||
181 | ['IsMirrored', '$mir eq "Y"', ''], | |
182 | ||
183 | # Arabic | |
184 | ||
185 | ['ArabLink', '1', '$link'], | |
186 | ['ArabLnkGrp', '1', '$linkgroup'], | |
187 | ||
188 | # Jamo | |
189 | ||
190 | ['JamoShort', '1', '$short'], | |
499bfa7a JH |
191 | |
192 | # Syllables | |
193 | ||
7c6f5cd2 GS |
194 | syllable_defs(), |
195 | ||
a77b4ae5 GS |
196 | # Line break properties - Normative |
197 | ||
198 | ['IsLbrkBK','$brk eq "BK"', ''], # Mandatory Break | |
199 | ['IsLbrkCR','$brk eq "CR"', ''], # Carriage Return | |
200 | ['IsLbrkLF','$brk eq "LF"', ''], # Line Feed | |
201 | ['IsLbrkCM','$brk eq "CM"', ''], # Attached Characters and Combining Marks | |
202 | ['IsLbrkSG','$brk eq "SG"', ''], # Surrogates | |
203 | ['IsLbrkGL','$brk eq "GL"', ''], # Non-breaking (Glue) | |
204 | ['IsLbrkCB','$brk eq "CB"', ''], # Contingent Break Opportunity | |
205 | ['IsLbrkSP','$brk eq "SP"', ''], # Space | |
206 | ['IsLbrkZW','$brk eq "ZW"', ''], # Zero Width Space | |
207 | ||
208 | # Line break properties - Informative | |
209 | ['IsLbrkXX','$brk eq "XX"', ''], # Unknown | |
210 | ['IsLbrkOP','$brk eq "OP"', ''], # Opening Punctuation | |
211 | ['IsLbrkCL','$brk eq "CL"', ''], # Closing Punctuation | |
212 | ['IsLbrkQU','$brk eq "QU"', ''], # Ambiguous Quotation | |
213 | ['IsLbrkNS','$brk eq "NS"', ''], # Non Starter | |
214 | ['IsLbrkEX','$brk eq "EX"', ''], # Exclamation/Interrogation | |
215 | ['IsLbrkSY','$brk eq "SY"', ''], # Symbols Allowing Breaks | |
216 | ['IsLbrkIS','$brk eq "IS"', ''], # Infix Separator (Numeric) | |
217 | ['IsLbrkPR','$brk eq "PR"', ''], # Prefix (Numeric) | |
218 | ['IsLbrkPO','$brk eq "PO"', ''], # Postfix (Numeric) | |
219 | ['IsLbrkNU','$brk eq "NU"', ''], # Numeric | |
220 | ['IsLbrkAL','$brk eq "AL"', ''], # Ordinary Alphabetic and Symbol Characters | |
221 | ['IsLbrkID','$brk eq "ID"', ''], # Ideographic | |
222 | ['IsLbrkIN','$brk eq "IN"', ''], # Inseparable | |
223 | ['IsLbrkHY','$brk eq "HY"', ''], # Hyphen | |
224 | ['IsLbrkBB','$brk eq "BB"', ''], # Break Opportunity Before | |
225 | ['IsLbrkBA','$brk eq "BA"', ''], # Break Opportunity After | |
226 | ['IsLbrkSA','$brk eq "SA"', ''], # Complex Context (South East Asian) | |
227 | ['IsLbrkAI','$brk eq "AI"', ''], # Ambiguous (Alphabetic or Ideographic) | |
228 | ['IsLbrkB2','$brk eq "B2"', ''], # Break Opportunity Before and After | |
a0ed51b3 LW |
229 | ); |
230 | ||
231 | # This is not written for speed... | |
232 | ||
233 | foreach $file (@todo) { | |
234 | my ($table, $wanted, $val) = @$file; | |
235 | next if @ARGV and not grep { $_ eq $table } @ARGV; | |
236 | print $table,"\n"; | |
237 | if ($table =~ /^(Is|In|To)(.*)/) { | |
238 | open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n"; | |
239 | } | |
240 | else { | |
241 | open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n"; | |
242 | } | |
14055466 JH |
243 | print OUT <<EOH; |
244 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
245 | # This file is built by $0 from e.g. $UnicodeData. | |
246 | # Any changes made here will be lost! | |
247 | EOH | |
a0ed51b3 LW |
248 | print OUT <<"END"; |
249 | return <<'END'; | |
250 | END | |
251 | print OUT proplist($table, $wanted, $val); | |
252 | print OUT "END\n"; | |
253 | close OUT; | |
254 | } | |
255 | ||
256 | # Must treat blocks specially. | |
257 | ||
258 | exit if @ARGV and not grep { $_ eq Block } @ARGV; | |
259 | print "Block\n"; | |
7c6f5cd2 GS |
260 | open(UD, 'Blocks.txt') or die "Can't open Blocks.txt: $!\n"; |
261 | open(OUT, ">Block.pl") or die "Can't create Block.pl: $!\n"; | |
14055466 JH |
262 | print OUT <<EOH; |
263 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
264 | # This file is built by $0 from e.g. $UnicodeData. | |
265 | # Any changes made here will be lost! | |
266 | EOH | |
a0ed51b3 LW |
267 | print OUT <<"END"; |
268 | return <<'END'; | |
269 | END | |
270 | ||
271 | while (<UD>) { | |
272 | next if /^#/; | |
273 | next if /^$/; | |
274 | chomp; | |
275 | ($code, $last, $name) = split(/; */); | |
276 | if ($name) { | |
277 | print OUT "$code $last $name\n"; | |
278 | $name =~ s/\s+//g; | |
279 | open(BLOCK, ">In/$name.pl"); | |
14055466 JH |
280 | print BLOCK <<EOH; |
281 | # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! | |
282 | # This file is built by $0 from e.g. $UnicodeData. | |
283 | # Any changes made here will be lost! | |
284 | EOH | |
a0ed51b3 LW |
285 | print BLOCK <<"END2"; |
286 | return <<'END'; | |
287 | $code $last | |
288 | END | |
289 | END2 | |
290 | close BLOCK; | |
291 | } | |
292 | } | |
293 | ||
294 | print OUT "END\n"; | |
295 | close OUT; | |
296 | ||
297 | ################################################## | |
298 | ||
299 | sub proplist { | |
300 | my ($table, $wanted, $val) = @_; | |
301 | my @wanted; | |
302 | my $out; | |
303 | my $split; | |
304 | ||
7c6f5cd2 GS |
305 | return listFromPropFile($wanted) if $val eq $PropData; |
306 | ||
a0ed51b3 | 307 | if ($table =~ /^Arab/) { |
d357d9fe | 308 | open(UD, "ArabShap.txt") or warn "Can't open $table: $!"; |
a0ed51b3 LW |
309 | |
310 | $split = '($code, $name, $link, $linkgroup) = split(/; */);'; | |
311 | } | |
312 | elsif ($table =~ /^Jamo/) { | |
505afebf | 313 | open(UD, "Jamo.txt") or warn "Can't open $table: $!"; |
a0ed51b3 LW |
314 | |
315 | $split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;'; | |
316 | } | |
499bfa7a | 317 | elsif ($table =~ /^IsSyl/) { |
7c6f5cd2 | 318 | open(UD, $SyllableData) or warn "Can't open $table: $!"; |
499bfa7a JH |
319 | |
320 | $split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;'; | |
321 | } | |
a77b4ae5 GS |
322 | elsif ($table =~ /^IsLbrk/) { |
323 | open(UD, "LineBrk.txt") or warn "Can't open $table: $!"; | |
324 | ||
325 | $split = '($code, $brk, $name) = split(/;/);'; | |
326 | } | |
a0ed51b3 | 327 | else { |
11695a73 | 328 | open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!"; |
a0ed51b3 LW |
329 | |
330 | $split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1, | |
331 | $comment, $up, $down, $title) = split(/;/);'; | |
332 | } | |
333 | ||
334 | if ($table =~ /^(?:To|Is)[A-Z]/) { | |
335 | eval <<"END"; | |
336 | while (<UD>) { | |
337 | next if /^#/; | |
7c6f5cd2 GS |
338 | next if /^\\s/; |
339 | s/\\s+\$//; | |
a0ed51b3 LW |
340 | $split |
341 | if ($wanted) { | |
342 | push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]); | |
343 | } | |
344 | } | |
345 | END | |
346 | die $@ if $@; | |
347 | ||
348 | while (@wanted) { | |
349 | $beg = shift @wanted; | |
350 | $last = $beg; | |
351 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and | |
352 | (not $val or $wanted[0]->[1] == $last->[1] + 1)) { | |
353 | $last = shift @wanted; | |
354 | } | |
355 | $out .= sprintf "%04x", $beg->[0]; | |
356 | if ($beg->[2]) { | |
357 | $last = shift @wanted; | |
358 | } | |
359 | if ($beg == $last) { | |
360 | $out .= "\t"; | |
361 | } | |
362 | else { | |
363 | $out .= sprintf "\t%04x", $last->[0]; | |
364 | } | |
365 | $out .= sprintf "\t%04x", $beg->[1] if $val; | |
366 | $out .= "\n"; | |
367 | } | |
368 | } | |
369 | else { | |
370 | eval <<"END"; | |
371 | while (<UD>) { | |
372 | next if /^#/; | |
7c6f5cd2 | 373 | next if /^\\s*\$/; |
a0ed51b3 LW |
374 | chop; |
375 | $split | |
376 | if ($wanted) { | |
377 | push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]); | |
378 | } | |
379 | } | |
380 | END | |
381 | die $@ if $@; | |
382 | ||
383 | while (@wanted) { | |
384 | $beg = shift @wanted; | |
385 | $last = $beg; | |
386 | while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and | |
387 | ($wanted[0]->[1] eq $last->[1])) { | |
388 | $last = shift @wanted; | |
389 | } | |
390 | $out .= sprintf "%04x", $beg->[0]; | |
391 | if ($beg->[2]) { | |
392 | $last = shift @wanted; | |
393 | } | |
394 | if ($beg == $last) { | |
395 | $out .= "\t"; | |
396 | } | |
397 | else { | |
398 | $out .= sprintf "\t%04x", $last->[0]; | |
399 | } | |
400 | $out .= sprintf "\t%s\n", $beg->[1]; | |
401 | } | |
402 | } | |
403 | $out; | |
404 | } | |
11695a73 | 405 | |
7c6f5cd2 GS |
406 | sub listFromPropFile { |
407 | my ($wanted) = @_; | |
408 | my $out; | |
409 | ||
410 | open (UD, $PropData) or die "Can't open $PropData: $!\n"; | |
411 | local($/) = "\n" . '*' x 43 . "\n\nProperty dump for:"; # not 42? | |
412 | ||
413 | <UD>; | |
414 | while (<UD>) { | |
415 | chomp; | |
416 | if (s/0x[\d\w]+\s+\((.*?)\)// and $wanted eq $1) { | |
417 | s/\(\d+ chars\)//g; | |
418 | s/^\s+//mg; | |
419 | s/\s+$//mg; | |
420 | s/\.\./\t/g; | |
421 | $out = lc $_; | |
422 | last; | |
423 | } | |
424 | } | |
425 | close (UD); | |
426 | "$out\n"; | |
427 | } | |
428 | ||
429 | sub syllable_defs { | |
430 | my @defs; | |
431 | my %seen; | |
432 | ||
433 | open (SD, $SyllableData) or die "Can't open $SyllableData: $!\n"; | |
434 | while (<SD>) { | |
435 | next if /^\s*(#|$)/; | |
436 | s/\s+$//; | |
437 | ($code, $name, $syl) = split /; */; | |
438 | next unless $syl; | |
439 | push (@defs, ["IsSyl$syl", qq{\$syl eq "$syl"}, '']) | |
440 | unless $seen{$syl}++; | |
441 | } | |
442 | close (SD); | |
443 | return (@defs); | |
444 | } | |
445 | ||
6dd159d1 | 446 | # eof |