Commit | Line | Data |
---|---|---|
25a47338 | 1 | #!perl -w |
8b731da2 | 2 | BEGIN { |
a452d459 | 3 | if (ord("A") != 65) { |
8b731da2 JH |
4 | print "1..0 # Skip: EBCDIC\n"; |
5 | exit 0; | |
6 | } | |
a778afa6 AD |
7 | chdir 't' if -d 't'; |
8 | @INC = '../lib'; | |
25a47338 NC |
9 | require Config; import Config; |
10 | if ($Config{'extensions'} !~ /\bStorable\b/) { | |
11 | print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n"; | |
12 | exit 0; | |
13 | } | |
8b731da2 JH |
14 | } |
15 | ||
a778afa6 AD |
16 | use strict; |
17 | use Unicode::UCD; | |
f5c9f3db | 18 | use Test::More; |
8b731da2 | 19 | |
55d7b906 | 20 | use Unicode::UCD 'charinfo'; |
561c79ed | 21 | |
6bbe4a24 KW |
22 | my $input_record_separator = 7; # Make sure Unicode::UCD isn't affected by |
23 | $/ = $input_record_separator; # setting this. | |
4f1530fd | 24 | |
b08cd201 JH |
25 | my $charinfo; |
26 | ||
9e92970c KW |
27 | is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef"); |
28 | ||
e10d7780 KW |
29 | $charinfo = charinfo(0); # Null is often problematic, so test it. |
30 | ||
31 | is($charinfo->{code}, '0000', '<control>'); | |
32 | is($charinfo->{name}, '<control>'); | |
33 | is($charinfo->{category}, 'Cc'); | |
34 | is($charinfo->{combining}, '0'); | |
35 | is($charinfo->{bidi}, 'BN'); | |
36 | is($charinfo->{decomposition}, ''); | |
37 | is($charinfo->{decimal}, ''); | |
38 | is($charinfo->{digit}, ''); | |
39 | is($charinfo->{numeric}, ''); | |
40 | is($charinfo->{mirrored}, 'N'); | |
41 | is($charinfo->{unicode10}, 'NULL'); | |
42 | is($charinfo->{comment}, ''); | |
43 | is($charinfo->{upper}, ''); | |
44 | is($charinfo->{lower}, ''); | |
45 | is($charinfo->{title}, ''); | |
46 | is($charinfo->{block}, 'Basic Latin'); | |
47 | is($charinfo->{script}, 'Common'); | |
48 | ||
b08cd201 JH |
49 | $charinfo = charinfo(0x41); |
50 | ||
f5c9f3db JH |
51 | is($charinfo->{code}, '0041', 'LATIN CAPITAL LETTER A'); |
52 | is($charinfo->{name}, 'LATIN CAPITAL LETTER A'); | |
53 | is($charinfo->{category}, 'Lu'); | |
54 | is($charinfo->{combining}, '0'); | |
55 | is($charinfo->{bidi}, 'L'); | |
56 | is($charinfo->{decomposition}, ''); | |
57 | is($charinfo->{decimal}, ''); | |
58 | is($charinfo->{digit}, ''); | |
59 | is($charinfo->{numeric}, ''); | |
60 | is($charinfo->{mirrored}, 'N'); | |
61 | is($charinfo->{unicode10}, ''); | |
62 | is($charinfo->{comment}, ''); | |
63 | is($charinfo->{upper}, ''); | |
64 | is($charinfo->{lower}, '0061'); | |
65 | is($charinfo->{title}, ''); | |
66 | is($charinfo->{block}, 'Basic Latin'); | |
67 | is($charinfo->{script}, 'Latin'); | |
b08cd201 JH |
68 | |
69 | $charinfo = charinfo(0x100); | |
70 | ||
f5c9f3db JH |
71 | is($charinfo->{code}, '0100', 'LATIN CAPITAL LETTER A WITH MACRON'); |
72 | is($charinfo->{name}, 'LATIN CAPITAL LETTER A WITH MACRON'); | |
73 | is($charinfo->{category}, 'Lu'); | |
74 | is($charinfo->{combining}, '0'); | |
75 | is($charinfo->{bidi}, 'L'); | |
76 | is($charinfo->{decomposition}, '0041 0304'); | |
77 | is($charinfo->{decimal}, ''); | |
78 | is($charinfo->{digit}, ''); | |
79 | is($charinfo->{numeric}, ''); | |
80 | is($charinfo->{mirrored}, 'N'); | |
81 | is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER A MACRON'); | |
82 | is($charinfo->{comment}, ''); | |
83 | is($charinfo->{upper}, ''); | |
84 | is($charinfo->{lower}, '0101'); | |
85 | is($charinfo->{title}, ''); | |
86 | is($charinfo->{block}, 'Latin Extended-A'); | |
87 | is($charinfo->{script}, 'Latin'); | |
a196fbfd JH |
88 | |
89 | # 0x0590 is in the Hebrew block but unused. | |
561c79ed | 90 | |
b08cd201 JH |
91 | $charinfo = charinfo(0x590); |
92 | ||
f5c9f3db JH |
93 | is($charinfo->{code}, undef, '0x0590 - unused Hebrew'); |
94 | is($charinfo->{name}, undef); | |
95 | is($charinfo->{category}, undef); | |
96 | is($charinfo->{combining}, undef); | |
97 | is($charinfo->{bidi}, undef); | |
98 | is($charinfo->{decomposition}, undef); | |
99 | is($charinfo->{decimal}, undef); | |
100 | is($charinfo->{digit}, undef); | |
101 | is($charinfo->{numeric}, undef); | |
102 | is($charinfo->{mirrored}, undef); | |
103 | is($charinfo->{unicode10}, undef); | |
104 | is($charinfo->{comment}, undef); | |
105 | is($charinfo->{upper}, undef); | |
106 | is($charinfo->{lower}, undef); | |
107 | is($charinfo->{title}, undef); | |
108 | is($charinfo->{block}, undef); | |
109 | is($charinfo->{script}, undef); | |
a196fbfd JH |
110 | |
111 | # 0x05d0 is in the Hebrew block and used. | |
561c79ed | 112 | |
b08cd201 JH |
113 | $charinfo = charinfo(0x5d0); |
114 | ||
f5c9f3db JH |
115 | is($charinfo->{code}, '05D0', '05D0 - used Hebrew'); |
116 | is($charinfo->{name}, 'HEBREW LETTER ALEF'); | |
117 | is($charinfo->{category}, 'Lo'); | |
118 | is($charinfo->{combining}, '0'); | |
119 | is($charinfo->{bidi}, 'R'); | |
120 | is($charinfo->{decomposition}, ''); | |
121 | is($charinfo->{decimal}, ''); | |
122 | is($charinfo->{digit}, ''); | |
123 | is($charinfo->{numeric}, ''); | |
124 | is($charinfo->{mirrored}, 'N'); | |
125 | is($charinfo->{unicode10}, ''); | |
126 | is($charinfo->{comment}, ''); | |
127 | is($charinfo->{upper}, ''); | |
128 | is($charinfo->{lower}, ''); | |
129 | is($charinfo->{title}, ''); | |
130 | is($charinfo->{block}, 'Hebrew'); | |
131 | is($charinfo->{script}, 'Hebrew'); | |
561c79ed | 132 | |
74f8133e | 133 | # An open syllable in Hangul. |
a6fa416b TS |
134 | |
135 | $charinfo = charinfo(0xAC00); | |
136 | ||
05dbc6f8 KW |
137 | is($charinfo->{code}, 'AC00', 'HANGUL SYLLABLE U+AC00'); |
138 | is($charinfo->{name}, 'HANGUL SYLLABLE GA'); | |
f5c9f3db JH |
139 | is($charinfo->{category}, 'Lo'); |
140 | is($charinfo->{combining}, '0'); | |
141 | is($charinfo->{bidi}, 'L'); | |
05dbc6f8 | 142 | is($charinfo->{decomposition}, '1100 1161'); |
f5c9f3db JH |
143 | is($charinfo->{decimal}, ''); |
144 | is($charinfo->{digit}, ''); | |
145 | is($charinfo->{numeric}, ''); | |
146 | is($charinfo->{mirrored}, 'N'); | |
147 | is($charinfo->{unicode10}, ''); | |
148 | is($charinfo->{comment}, ''); | |
149 | is($charinfo->{upper}, ''); | |
150 | is($charinfo->{lower}, ''); | |
151 | is($charinfo->{title}, ''); | |
152 | is($charinfo->{block}, 'Hangul Syllables'); | |
153 | is($charinfo->{script}, 'Hangul'); | |
a6fa416b | 154 | |
74f8133e | 155 | # A closed syllable in Hangul. |
a6fa416b TS |
156 | |
157 | $charinfo = charinfo(0xAE00); | |
158 | ||
05dbc6f8 KW |
159 | is($charinfo->{code}, 'AE00', 'HANGUL SYLLABLE U+AE00'); |
160 | is($charinfo->{name}, 'HANGUL SYLLABLE GEUL'); | |
f5c9f3db JH |
161 | is($charinfo->{category}, 'Lo'); |
162 | is($charinfo->{combining}, '0'); | |
163 | is($charinfo->{bidi}, 'L'); | |
05dbc6f8 | 164 | is($charinfo->{decomposition}, "1100 1173 11AF"); |
f5c9f3db JH |
165 | is($charinfo->{decimal}, ''); |
166 | is($charinfo->{digit}, ''); | |
167 | is($charinfo->{numeric}, ''); | |
168 | is($charinfo->{mirrored}, 'N'); | |
169 | is($charinfo->{unicode10}, ''); | |
170 | is($charinfo->{comment}, ''); | |
171 | is($charinfo->{upper}, ''); | |
172 | is($charinfo->{lower}, ''); | |
173 | is($charinfo->{title}, ''); | |
174 | is($charinfo->{block}, 'Hangul Syllables'); | |
175 | is($charinfo->{script}, 'Hangul'); | |
a6fa416b TS |
176 | |
177 | $charinfo = charinfo(0x1D400); | |
178 | ||
f5c9f3db JH |
179 | is($charinfo->{code}, '1D400', 'MATHEMATICAL BOLD CAPITAL A'); |
180 | is($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A'); | |
181 | is($charinfo->{category}, 'Lu'); | |
182 | is($charinfo->{combining}, '0'); | |
183 | is($charinfo->{bidi}, 'L'); | |
184 | is($charinfo->{decomposition}, '<font> 0041'); | |
185 | is($charinfo->{decimal}, ''); | |
186 | is($charinfo->{digit}, ''); | |
187 | is($charinfo->{numeric}, ''); | |
188 | is($charinfo->{mirrored}, 'N'); | |
189 | is($charinfo->{unicode10}, ''); | |
190 | is($charinfo->{comment}, ''); | |
191 | is($charinfo->{upper}, ''); | |
192 | is($charinfo->{lower}, ''); | |
193 | is($charinfo->{title}, ''); | |
194 | is($charinfo->{block}, 'Mathematical Alphanumeric Symbols'); | |
7be0dac3 | 195 | is($charinfo->{script}, 'Common'); |
a6fa416b | 196 | |
a452d459 KW |
197 | $charinfo = charinfo(0x9FBA); #Bug 58428 |
198 | ||
199 | is($charinfo->{code}, '9FBA', 'U+9FBA'); | |
200 | is($charinfo->{name}, 'CJK UNIFIED IDEOGRAPH-9FBA'); | |
201 | is($charinfo->{category}, 'Lo'); | |
202 | is($charinfo->{combining}, '0'); | |
203 | is($charinfo->{bidi}, 'L'); | |
204 | is($charinfo->{decomposition}, ''); | |
205 | is($charinfo->{decimal}, ''); | |
206 | is($charinfo->{digit}, ''); | |
207 | is($charinfo->{numeric}, ''); | |
208 | is($charinfo->{mirrored}, 'N'); | |
209 | is($charinfo->{unicode10}, ''); | |
210 | is($charinfo->{comment}, ''); | |
211 | is($charinfo->{upper}, ''); | |
212 | is($charinfo->{lower}, ''); | |
213 | is($charinfo->{title}, ''); | |
214 | is($charinfo->{block}, 'CJK Unified Ideographs'); | |
215 | is($charinfo->{script}, 'Han'); | |
216 | ||
55d7b906 | 217 | use Unicode::UCD qw(charblock charscript); |
a196fbfd JH |
218 | |
219 | # 0x0590 is in the Hebrew block but unused. | |
561c79ed | 220 | |
f5c9f3db | 221 | is(charblock(0x590), 'Hebrew', '0x0590 - Hebrew unused charblock'); |
8079ad82 | 222 | is(charscript(0x590), 'Unknown', '0x0590 - Hebrew unused charscript'); |
c707cf8e | 223 | is(charblock(0x1FFFF), 'No_Block', '0x1FFFF - unused charblock'); |
561c79ed | 224 | |
b08cd201 JH |
225 | $charinfo = charinfo(0xbe); |
226 | ||
f5c9f3db JH |
227 | is($charinfo->{code}, '00BE', 'VULGAR FRACTION THREE QUARTERS'); |
228 | is($charinfo->{name}, 'VULGAR FRACTION THREE QUARTERS'); | |
229 | is($charinfo->{category}, 'No'); | |
230 | is($charinfo->{combining}, '0'); | |
231 | is($charinfo->{bidi}, 'ON'); | |
232 | is($charinfo->{decomposition}, '<fraction> 0033 2044 0034'); | |
233 | is($charinfo->{decimal}, ''); | |
234 | is($charinfo->{digit}, ''); | |
235 | is($charinfo->{numeric}, '3/4'); | |
236 | is($charinfo->{mirrored}, 'N'); | |
237 | is($charinfo->{unicode10}, 'FRACTION THREE QUARTERS'); | |
238 | is($charinfo->{comment}, ''); | |
239 | is($charinfo->{upper}, ''); | |
240 | is($charinfo->{lower}, ''); | |
241 | is($charinfo->{title}, ''); | |
242 | is($charinfo->{block}, 'Latin-1 Supplement'); | |
7be0dac3 | 243 | is($charinfo->{script}, 'Common'); |
10a6ecd2 | 244 | |
7fcb6f68 KW |
245 | # This is to test a case where both simple and full lowercases exist and |
246 | # differ | |
247 | $charinfo = charinfo(0x130); | |
248 | ||
249 | is($charinfo->{code}, '0130', 'LATIN CAPITAL LETTER I WITH DOT ABOVE'); | |
250 | is($charinfo->{name}, 'LATIN CAPITAL LETTER I WITH DOT ABOVE'); | |
251 | is($charinfo->{category}, 'Lu'); | |
252 | is($charinfo->{combining}, '0'); | |
253 | is($charinfo->{bidi}, 'L'); | |
254 | is($charinfo->{decomposition}, '0049 0307'); | |
255 | is($charinfo->{decimal}, ''); | |
256 | is($charinfo->{digit}, ''); | |
257 | is($charinfo->{numeric}, ''); | |
258 | is($charinfo->{mirrored}, 'N'); | |
259 | is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER I DOT'); | |
260 | is($charinfo->{comment}, ''); | |
261 | is($charinfo->{upper}, ''); | |
262 | is($charinfo->{lower}, '0069'); | |
263 | is($charinfo->{title}, ''); | |
264 | is($charinfo->{block}, 'Latin Extended-A'); | |
265 | is($charinfo->{script}, 'Latin'); | |
266 | ||
267 | # This is to test a case where both simple and full uppercases exist and | |
268 | # differ | |
269 | $charinfo = charinfo(0x1F80); | |
270 | ||
271 | is($charinfo->{code}, '1F80', 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI'); | |
272 | is($charinfo->{name}, 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI'); | |
273 | is($charinfo->{category}, 'Ll'); | |
274 | is($charinfo->{combining}, '0'); | |
275 | is($charinfo->{bidi}, 'L'); | |
276 | is($charinfo->{decomposition}, '1F00 0345'); | |
277 | is($charinfo->{decimal}, ''); | |
278 | is($charinfo->{digit}, ''); | |
279 | is($charinfo->{numeric}, ''); | |
280 | is($charinfo->{mirrored}, 'N'); | |
281 | is($charinfo->{unicode10}, ''); | |
282 | is($charinfo->{comment}, ''); | |
283 | is($charinfo->{upper}, '1F88'); | |
284 | is($charinfo->{lower}, ''); | |
285 | is($charinfo->{title}, '1F88'); | |
286 | is($charinfo->{block}, 'Greek Extended'); | |
287 | is($charinfo->{script}, 'Greek'); | |
288 | ||
55d7b906 | 289 | use Unicode::UCD qw(charblocks charscripts); |
10a6ecd2 | 290 | |
b08cd201 | 291 | my $charblocks = charblocks(); |
10a6ecd2 | 292 | |
f5c9f3db JH |
293 | ok(exists $charblocks->{Thai}, 'Thai charblock exists'); |
294 | is($charblocks->{Thai}->[0]->[0], hex('0e00')); | |
295 | ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist'); | |
10a6ecd2 | 296 | |
b08cd201 | 297 | my $charscripts = charscripts(); |
10a6ecd2 | 298 | |
f5c9f3db JH |
299 | ok(exists $charscripts->{Armenian}, 'Armenian charscript exists'); |
300 | is($charscripts->{Armenian}->[0]->[0], hex('0531')); | |
301 | ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist'); | |
10a6ecd2 JH |
302 | |
303 | my $charscript; | |
304 | ||
305 | $charscript = charscript("12ab"); | |
f5c9f3db | 306 | is($charscript, 'Ethiopic', 'Ethiopic charscript'); |
10a6ecd2 JH |
307 | |
308 | $charscript = charscript("0x12ab"); | |
f5c9f3db | 309 | is($charscript, 'Ethiopic'); |
10a6ecd2 JH |
310 | |
311 | $charscript = charscript("U+12ab"); | |
f5c9f3db | 312 | is($charscript, 'Ethiopic'); |
10a6ecd2 JH |
313 | |
314 | my $ranges; | |
315 | ||
316 | $ranges = charscript('Ogham'); | |
7bccef0b KW |
317 | is($ranges->[0]->[0], hex('1680'), 'Ogham charscript'); |
318 | is($ranges->[0]->[1], hex('169C')); | |
10a6ecd2 | 319 | |
55d7b906 | 320 | use Unicode::UCD qw(charinrange); |
10a6ecd2 JH |
321 | |
322 | $ranges = charscript('Cherokee'); | |
f5c9f3db | 323 | ok(!charinrange($ranges, "139f"), 'Cherokee charscript'); |
10a6ecd2 JH |
324 | ok( charinrange($ranges, "13a0")); |
325 | ok( charinrange($ranges, "13f4")); | |
326 | ok(!charinrange($ranges, "13f5")); | |
327 | ||
ea508aee JH |
328 | use Unicode::UCD qw(general_categories); |
329 | ||
330 | my $gc = general_categories(); | |
331 | ||
332 | ok(exists $gc->{L}, 'has L'); | |
333 | is($gc->{L}, 'Letter', 'L is Letter'); | |
334 | is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter'); | |
335 | ||
336 | use Unicode::UCD qw(bidi_types); | |
337 | ||
338 | my $bt = bidi_types(); | |
339 | ||
340 | ok(exists $bt->{L}, 'has L'); | |
341 | is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right'); | |
342 | is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic'); | |
343 | ||
a452d459 KW |
344 | # If this fails, then maybe one should look at the Unicode changes to see |
345 | # what else might need to be updated. | |
83d881f0 | 346 | is(Unicode::UCD::UnicodeVersion, '6.2.0', 'UnicodeVersion'); |
b08cd201 | 347 | |
55d7b906 | 348 | use Unicode::UCD qw(compexcl); |
b08cd201 | 349 | |
f5c9f3db | 350 | ok(!compexcl(0x0100), 'compexcl'); |
71a442a8 KW |
351 | ok(!compexcl(0xD801), 'compexcl of surrogate'); |
352 | ok(!compexcl(0x110000), 'compexcl of non-Unicode code point'); | |
b08cd201 JH |
353 | ok( compexcl(0x0958)); |
354 | ||
55d7b906 | 355 | use Unicode::UCD qw(casefold); |
b08cd201 JH |
356 | |
357 | my $casefold; | |
358 | ||
359 | $casefold = casefold(0x41); | |
360 | ||
a452d459 KW |
361 | is($casefold->{code}, '0041', 'casefold 0x41 code'); |
362 | is($casefold->{status}, 'C', 'casefold 0x41 status'); | |
363 | is($casefold->{mapping}, '0061', 'casefold 0x41 mapping'); | |
364 | is($casefold->{full}, '0061', 'casefold 0x41 full'); | |
365 | is($casefold->{simple}, '0061', 'casefold 0x41 simple'); | |
366 | is($casefold->{turkic}, "", 'casefold 0x41 turkic'); | |
b08cd201 JH |
367 | |
368 | $casefold = casefold(0xdf); | |
369 | ||
a452d459 KW |
370 | is($casefold->{code}, '00DF', 'casefold 0xDF code'); |
371 | is($casefold->{status}, 'F', 'casefold 0xDF status'); | |
372 | is($casefold->{mapping}, '0073 0073', 'casefold 0xDF mapping'); | |
373 | is($casefold->{full}, '0073 0073', 'casefold 0xDF full'); | |
374 | is($casefold->{simple}, "", 'casefold 0xDF simple'); | |
375 | is($casefold->{turkic}, "", 'casefold 0xDF turkic'); | |
376 | ||
863a4fdf KW |
377 | # Do different tests depending on if version < 3.2, or not. |
378 | my $v_unicode_version = pack "C*", split /\./, Unicode::UCD::UnicodeVersion(); | |
379 | if ($v_unicode_version lt v3.2.0) { | |
a452d459 KW |
380 | $casefold = casefold(0x130); |
381 | ||
382 | is($casefold->{code}, '0130', 'casefold 0x130 code'); | |
383 | is($casefold->{status}, 'I' , 'casefold 0x130 status'); | |
384 | is($casefold->{mapping}, '0069', 'casefold 0x130 mapping'); | |
385 | is($casefold->{full}, '0069', 'casefold 0x130 full'); | |
386 | is($casefold->{simple}, "0069", 'casefold 0x130 simple'); | |
387 | is($casefold->{turkic}, "0069", 'casefold 0x130 turkic'); | |
388 | ||
389 | $casefold = casefold(0x131); | |
390 | ||
391 | is($casefold->{code}, '0131', 'casefold 0x131 code'); | |
392 | is($casefold->{status}, 'I' , 'casefold 0x131 status'); | |
393 | is($casefold->{mapping}, '0069', 'casefold 0x131 mapping'); | |
394 | is($casefold->{full}, '0069', 'casefold 0x131 full'); | |
395 | is($casefold->{simple}, "0069", 'casefold 0x131 simple'); | |
396 | is($casefold->{turkic}, "0069", 'casefold 0x131 turkic'); | |
397 | } else { | |
398 | $casefold = casefold(0x49); | |
399 | ||
400 | is($casefold->{code}, '0049', 'casefold 0x49 code'); | |
401 | is($casefold->{status}, 'C' , 'casefold 0x49 status'); | |
402 | is($casefold->{mapping}, '0069', 'casefold 0x49 mapping'); | |
403 | is($casefold->{full}, '0069', 'casefold 0x49 full'); | |
404 | is($casefold->{simple}, "0069", 'casefold 0x49 simple'); | |
405 | is($casefold->{turkic}, "0131", 'casefold 0x49 turkic'); | |
406 | ||
407 | $casefold = casefold(0x130); | |
408 | ||
409 | is($casefold->{code}, '0130', 'casefold 0x130 code'); | |
410 | is($casefold->{status}, 'F' , 'casefold 0x130 status'); | |
411 | is($casefold->{mapping}, '0069 0307', 'casefold 0x130 mapping'); | |
412 | is($casefold->{full}, '0069 0307', 'casefold 0x130 full'); | |
413 | is($casefold->{simple}, "", 'casefold 0x130 simple'); | |
414 | is($casefold->{turkic}, "0069", 'casefold 0x130 turkic'); | |
415 | } | |
416 | ||
417 | $casefold = casefold(0x1F88); | |
418 | ||
419 | is($casefold->{code}, '1F88', 'casefold 0x1F88 code'); | |
420 | is($casefold->{status}, 'S' , 'casefold 0x1F88 status'); | |
421 | is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping'); | |
422 | is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full'); | |
423 | is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple'); | |
424 | is($casefold->{turkic}, "", 'casefold 0x1F88 turkic'); | |
b08cd201 JH |
425 | |
426 | ok(!casefold(0x20)); | |
427 | ||
55d7b906 | 428 | use Unicode::UCD qw(casespec); |
b08cd201 JH |
429 | |
430 | my $casespec; | |
431 | ||
432 | ok(!casespec(0x41)); | |
433 | ||
434 | $casespec = casespec(0xdf); | |
435 | ||
436 | ok($casespec->{code} eq '00DF' && | |
437 | $casespec->{lower} eq '00DF' && | |
438 | $casespec->{title} eq '0053 0073' && | |
439 | $casespec->{upper} eq '0053 0053' && | |
2d3cf3ee | 440 | !defined $casespec->{condition}, 'casespec 0xDF'); |
b08cd201 JH |
441 | |
442 | $casespec = casespec(0x307); | |
443 | ||
f499c386 | 444 | ok($casespec->{az}->{code} eq '0307' && |
2d3cf3ee | 445 | !defined $casespec->{az}->{lower} && |
f499c386 JH |
446 | $casespec->{az}->{title} eq '0307' && |
447 | $casespec->{az}->{upper} eq '0307' && | |
9c3dc587 | 448 | $casespec->{az}->{condition} eq 'az After_I', |
f5c9f3db | 449 | 'casespec 0x307'); |
6c8d78fb HS |
450 | |
451 | # perl #7305 UnicodeCD::compexcl is weird | |
452 | ||
2d3cf3ee | 453 | for (1) {my $a=compexcl $_} |
6c8d78fb | 454 | ok(1, 'compexcl read-only $_: perl #7305'); |
1f27373c | 455 | map {compexcl $_} %{{1=>2}}; |
6c8d78fb HS |
456 | ok(1, 'compexcl read-only hash: perl #7305'); |
457 | ||
d7829152 JH |
458 | is(Unicode::UCD::_getcode('123'), 123, "_getcode(123)"); |
459 | is(Unicode::UCD::_getcode('0123'), 0x123, "_getcode(0123)"); | |
460 | is(Unicode::UCD::_getcode('0x123'), 0x123, "_getcode(0x123)"); | |
461 | is(Unicode::UCD::_getcode('0X123'), 0x123, "_getcode(0X123)"); | |
462 | is(Unicode::UCD::_getcode('U+123'), 0x123, "_getcode(U+123)"); | |
463 | is(Unicode::UCD::_getcode('u+123'), 0x123, "_getcode(u+123)"); | |
464 | is(Unicode::UCD::_getcode('U+1234'), 0x1234, "_getcode(U+1234)"); | |
465 | is(Unicode::UCD::_getcode('U+12345'), 0x12345, "_getcode(U+12345)"); | |
466 | is(Unicode::UCD::_getcode('123x'), undef, "_getcode(123x)"); | |
467 | is(Unicode::UCD::_getcode('x123'), undef, "_getcode(x123)"); | |
468 | is(Unicode::UCD::_getcode('0x123x'), undef, "_getcode(x123)"); | |
469 | is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); | |
741297c1 JH |
470 | |
471 | { | |
472 | my $r1 = charscript('Latin'); | |
49ea58c8 | 473 | if (ok(defined $r1, "Found Latin script")) { |
f7ef59f7 KW |
474 | my $n1 = @$r1; |
475 | is($n1, 30, "number of ranges in Latin script (Unicode 6.1.0)"); | |
476 | shift @$r1 while @$r1; | |
477 | my $r2 = charscript('Latin'); | |
478 | is(@$r2, $n1, "modifying results should not mess up internal caches"); | |
49ea58c8 | 479 | } |
741297c1 JH |
480 | } |
481 | ||
c5a29f40 LM |
482 | { |
483 | is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD"); | |
2d3cf3ee | 484 | } |
a2bd7410 JH |
485 | |
486 | use Unicode::UCD qw(namedseq); | |
487 | ||
488 | is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); | |
489 | is(namedseq("KATAKANA LETTER AINU Q"), undef); | |
490 | is(namedseq(), undef); | |
491 | is(namedseq(qw(foo bar)), undef); | |
492 | my @ns = namedseq("KATAKANA LETTER AINU P"); | |
493 | is(scalar @ns, 2); | |
494 | is($ns[0], 0x31F7); | |
495 | is($ns[1], 0x309A); | |
496 | my %ns = namedseq(); | |
497 | is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); | |
498 | @ns = namedseq(42); | |
499 | is(@ns, 0); | |
500 | ||
7319f91d KW |
501 | use Unicode::UCD qw(num); |
502 | use charnames ":full"; | |
503 | ||
504 | is(num("0"), 0, 'Verify num("0") == 0'); | |
505 | is(num("98765"), 98765, 'Verify num("98765") == 98765'); | |
506 | ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); | |
c4b3d89d KW |
507 | is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21'); |
508 | ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); | |
7319f91d KW |
509 | is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); |
510 | ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); | |
511 | is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); | |
512 | is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); | |
513 | is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); | |
98025745 | 514 | is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); |
4f143a72 | 515 | is(num("\N{U+5146}"), 1000000000000, 'Verify num("\N{U+5146}") == 1000000000000'); |
eaebe4db | 516 | |
7ef25837 KW |
517 | # Create a user-defined property |
518 | sub InKana {<<'END'} | |
519 | 3040 309F | |
520 | 30A0 30FF | |
521 | END | |
522 | ||
523 | use Unicode::UCD qw(prop_aliases); | |
524 | ||
525 | is(prop_aliases(undef), undef, "prop_aliases(undef) returns <undef>"); | |
526 | is(prop_aliases("unknown property"), undef, | |
527 | "prop_aliases(<unknown property>) returns <undef>"); | |
528 | is(prop_aliases("InKana"), undef, | |
529 | "prop_aliases(<user-defined property>) returns <undef>"); | |
530 | is(prop_aliases("Perl_Decomposition_Mapping"), undef, "prop_aliases('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only"); | |
531 | is(prop_aliases("Perl_Charnames"), undef, | |
532 | "prop_aliases('Perl_Charnames') returns <undef> since internal-Perl-only"); | |
533 | is(prop_aliases("isgc"), undef, | |
534 | "prop_aliases('isgc') returns <undef> since is not covered Perl extension"); | |
535 | is(prop_aliases("Is_Is_Any"), undef, | |
536 | "prop_aliases('Is_Is_Any') returns <undef> since two is's"); | |
537 | ||
538 | require 'utf8_heavy.pl'; | |
539 | require "unicore/Heavy.pl"; | |
540 | ||
541 | # Keys are lists of properties. Values are defined if have been tested. | |
542 | my %props; | |
543 | ||
544 | # To test for loose matching, add in the characters that are ignored there. | |
545 | my $extra_chars = "-_ "; | |
546 | ||
547 | # The one internal property we accept | |
548 | $props{'Perl_Decimal_Digit'} = 1; | |
549 | my @list = prop_aliases("perldecimaldigit"); | |
550 | is_deeply(\@list, | |
551 | [ "Perl_Decimal_Digit", | |
552 | "Perl_Decimal_Digit" | |
553 | ], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names"); | |
554 | ||
555 | # Get the official Unicode property name synonyms and test them. | |
0a2ae3b5 KW |
556 | |
557 | SKIP: { | |
558 | skip "PropertyAliases.txt is not in this Unicode version", 1 if $v_unicode_version lt v3.2.0; | |
7ef25837 KW |
559 | open my $props, "<", "../lib/unicore/PropertyAliases.txt" |
560 | or die "Can't open Unicode PropertyAliases.txt"; | |
6bbe4a24 | 561 | local $/ = "\n"; |
7ef25837 KW |
562 | while (<$props>) { |
563 | s/\s*#.*//; # Remove comments | |
564 | next if /^\s* $/x; # Ignore empty and comment lines | |
565 | ||
566 | chomp; | |
6bbe4a24 | 567 | local $/ = $input_record_separator; |
7ef25837 KW |
568 | my $count = 0; # 0th field in line is short name; 1th is long name |
569 | my $short_name; | |
570 | my $full_name; | |
571 | my @names_via_short; | |
572 | foreach my $alias (split /\s*;\s*/) { # Fields are separated by | |
573 | # semi-colons | |
574 | # Add in the characters that are supposed to be ignored, to test loose | |
575 | # matching, which the tested function does on all inputs. | |
576 | my $mod_name = "$extra_chars$alias"; | |
577 | ||
e72b6605 | 578 | my $loose = &utf8::_loose_name(lc $alias); |
7ef25837 KW |
579 | |
580 | # Indicate we have tested this. | |
581 | $props{$loose} = 1; | |
582 | ||
583 | my @all_names = prop_aliases($mod_name); | |
584 | if (grep { $_ eq $loose } @Unicode::UCD::suppressed_properties) { | |
585 | is(@all_names, 0, "prop_aliases('$mod_name') returns undef since $alias is not installed"); | |
586 | next; | |
587 | } | |
588 | elsif (! @all_names) { | |
589 | fail("prop_aliases('$mod_name')"); | |
590 | diag("'$alias' is unknown to prop_aliases()"); | |
591 | next; | |
592 | } | |
593 | ||
594 | if ($count == 0) { # Is short name | |
595 | ||
596 | @names_via_short = prop_aliases($mod_name); | |
597 | ||
598 | # If the 0th test fails, no sense in continuing with the others | |
599 | last unless is($names_via_short[0], $alias, | |
600 | "prop_aliases: '$alias' is the short name for '$mod_name'"); | |
601 | $short_name = $alias; | |
602 | } | |
603 | elsif ($count == 1) { # Is full name | |
604 | ||
605 | # Some properties have the same short and full name; no sense | |
606 | # repeating the test if the same. | |
607 | if ($alias ne $short_name) { | |
608 | my @names_via_full = prop_aliases($mod_name); | |
609 | is_deeply(\@names_via_full, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'"); | |
610 | } | |
611 | ||
612 | # Tests scalar context | |
613 | is(prop_aliases($short_name), $alias, | |
614 | "prop_aliases: '$alias' is the long name for '$short_name'"); | |
615 | } | |
616 | else { # Is another alias | |
617 | is_deeply(\@all_names, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'"); | |
618 | ok((grep { $_ =~ /^$alias$/i } @all_names), | |
619 | "prop_aliases: '$alias' is listed as an alias for '$mod_name'"); | |
620 | } | |
621 | ||
622 | $count++; | |
623 | } | |
624 | } | |
0a2ae3b5 | 625 | } # End of SKIP block |
7ef25837 KW |
626 | |
627 | # Now test anything we can find that wasn't covered by the tests of the | |
628 | # official properties. We have no way of knowing if mktables omitted a Perl | |
629 | # extension or not, but we do the best we can from its generated lists | |
630 | ||
631 | foreach my $alias (keys %utf8::loose_to_file_of) { | |
632 | next if $alias =~ /=/; | |
633 | my $lc_name = lc $alias; | |
e72b6605 | 634 | my $loose = &utf8::_loose_name($lc_name); |
7ef25837 KW |
635 | next if exists $props{$loose}; # Skip if already tested |
636 | $props{$loose} = 1; | |
637 | my $mod_name = "$extra_chars$alias"; # Tests loose matching | |
638 | my @aliases = prop_aliases($mod_name); | |
e72b6605 | 639 | my $found_it = grep { &utf8::_loose_name(lc $_) eq $lc_name } @aliases; |
7ef25837 KW |
640 | if ($found_it) { |
641 | pass("prop_aliases: '$lc_name' is listed as an alias for '$mod_name'"); | |
642 | } | |
643 | elsif ($lc_name =~ /l[_&]$/) { | |
644 | ||
645 | # These two names are special in that they don't appear in the | |
646 | # returned list because they are discouraged from use. Verify | |
647 | # that they return the same list as a non-discouraged version. | |
648 | my @LC = prop_aliases('Is_LC'); | |
649 | is_deeply(\@aliases, \@LC, "prop_aliases: '$lc_name' returns the same list as 'Is_LC'"); | |
650 | } | |
651 | else { | |
652 | my $stripped = $lc_name =~ s/^is//; | |
653 | ||
654 | # Could be that the input includes a prefix 'is', which is rarely | |
655 | # returned as an alias, so having successfully stripped it off above, | |
656 | # try again. | |
657 | if ($stripped) { | |
e72b6605 | 658 | $found_it = grep { &utf8::_loose_name(lc $_) eq $lc_name } @aliases; |
7ef25837 KW |
659 | } |
660 | ||
661 | # If that didn't work, it could be that it's a block, which is always | |
662 | # returned with a leading 'In_' to avoid ambiguity. Try comparing | |
663 | # with that stripped off. | |
664 | if (! $found_it) { | |
e72b6605 | 665 | $found_it = grep { &utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name } |
7ef25837 KW |
666 | @aliases; |
667 | # Could check that is a real block, but tests for invmap will | |
668 | # likely pickup any errors, since this will be tested there. | |
669 | $lc_name = "in$lc_name" if $found_it; # Change for message below | |
670 | } | |
671 | my $message = "prop_aliases: '$lc_name' is listed as an alias for '$mod_name'"; | |
672 | ($found_it) ? pass($message) : fail($message); | |
673 | } | |
674 | } | |
675 | ||
676 | my $done_equals = 0; | |
677 | foreach my $alias (keys %utf8::stricter_to_file_of) { | |
678 | if ($alias =~ /=/) { # Only test one case where there is an equals | |
679 | next if $done_equals; | |
680 | $done_equals = 1; | |
681 | } | |
682 | my $lc_name = lc $alias; | |
683 | my @list = prop_aliases($alias); | |
684 | if ($alias =~ /^_/) { | |
685 | is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since it is internal_only"); | |
686 | } | |
687 | elsif ($alias =~ /=/) { | |
688 | is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since is illegal property name"); | |
689 | } | |
690 | else { | |
691 | ok((grep { lc $_ eq $lc_name } @list), | |
692 | "prop_aliases: '$lc_name' is listed as an alias for '$alias'"); | |
693 | } | |
694 | } | |
695 | ||
696 | use Unicode::UCD qw(prop_value_aliases); | |
697 | ||
698 | is(prop_value_aliases("unknown property", "unknown value"), undef, | |
699 | "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>"); | |
700 | is(prop_value_aliases(undef, undef), undef, | |
701 | "prop_value_aliases(undef, undef) returns <undef>"); | |
702 | is((prop_value_aliases("na", "A")), "A", "test that prop_value_aliases returns its input for properties that don't have synonyms"); | |
703 | is(prop_value_aliases("isgc", "C"), undef, "prop_value_aliases('isgc', 'C') returns <undef> since is not covered Perl extension"); | |
704 | is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') returns <undef> since is not covered Perl extension"); | |
705 | ||
706 | # We have no way of knowing if mktables omitted a Perl extension that it | |
707 | # shouldn't have, but we can check if it omitted an official Unicode property | |
708 | # name synonym. And for those, we can check if the short and full names are | |
709 | # correct. | |
710 | ||
711 | my %pva_tested; # List of things already tested. | |
61b174f6 KW |
712 | |
713 | SKIP: { | |
714 | skip "PropValueAliases.txt is not in this Unicode version", 1 if $v_unicode_version lt v3.2.0; | |
7ef25837 KW |
715 | open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt" |
716 | or die "Can't open Unicode PropValueAliases.txt"; | |
6bbe4a24 | 717 | local $/ = "\n"; |
7ef25837 KW |
718 | while (<$propvalues>) { |
719 | s/\s*#.*//; # Remove comments | |
720 | next if /^\s* $/x; # Ignore empty and comment lines | |
721 | chomp; | |
6bbe4a24 | 722 | local $/ = $input_record_separator; |
7ef25837 | 723 | |
74090492 | 724 | # Fix typo in official input file |
863a4fdf | 725 | s/CCC133/CCC132/g if $v_unicode_version eq v6.1.0; |
74090492 | 726 | |
7ef25837 KW |
727 | my @fields = split /\s*;\s*/; # Fields are separated by semi-colons |
728 | my $prop = shift @fields; # 0th field is the property, | |
729 | my $count = 0; # 0th field in line (after shifting off the property) is | |
730 | # short name; 1th is long name | |
731 | my $short_name; | |
732 | my @names_via_short; # Saves the values between iterations | |
733 | ||
734 | # The property on the lhs of the = is always loosely matched. Add in | |
735 | # characters that are ignored under loose matching to test that | |
736 | my $mod_prop = "$extra_chars$prop"; | |
737 | ||
738 | if ($fields[0] eq 'n/a') { # See comments in input file, essentially | |
739 | # means full name and short name are identical | |
740 | $fields[0] = $fields[1]; | |
741 | } | |
742 | elsif ($fields[0] ne $fields[1] | |
e72b6605 KW |
743 | && &utf8::_loose_name(lc $fields[0]) |
744 | eq &utf8::_loose_name(lc $fields[1]) | |
7ef25837 KW |
745 | && $fields[1] !~ /[[:upper:]]/) |
746 | { | |
747 | # Also, there is a bug in the file in which "n/a" is omitted, and | |
748 | # the two fields are identical except for case, and the full name | |
749 | # is all lower case. Copy the "short" name unto the full one to | |
750 | # give it some upper case. | |
751 | ||
752 | $fields[1] = $fields[0]; | |
753 | } | |
754 | ||
755 | # The ccc property in the file is special; has an extra numeric field | |
756 | # (0th), which should go at the end, since we use the next two fields as | |
757 | # the short and full names, respectively. See comments in input file. | |
758 | splice (@fields, 0, 0, splice(@fields, 1, 2)) if $prop eq 'ccc'; | |
759 | ||
e72b6605 | 760 | my $loose_prop = &utf8::_loose_name(lc $prop); |
7ef25837 KW |
761 | my $suppressed = grep { $_ eq $loose_prop } |
762 | @Unicode::UCD::suppressed_properties; | |
763 | foreach my $value (@fields) { | |
764 | if ($suppressed) { | |
765 | is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop"); | |
766 | next; | |
767 | } | |
e72b6605 | 768 | elsif (grep { $_ eq ("$loose_prop=" . &utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) { |
7ef25837 KW |
769 | is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop=$value"); |
770 | next; | |
771 | } | |
772 | ||
773 | # Add in test for loose matching. | |
774 | my $mod_value = "$extra_chars$value"; | |
775 | ||
776 | # If the value is a number, optionally negative, including a floating | |
777 | # point or rational numer, it should be only strictly matched, so the | |
778 | # loose matching should fail. | |
779 | if ($value =~ / ^ -? \d+ (?: [\/.] \d+ )? $ /x) { | |
780 | is(prop_value_aliases($mod_prop, $mod_value), undef, "prop_value_aliases('$mod_prop', '$mod_value') returns undef because '$mod_value' should be strictly matched"); | |
781 | ||
782 | # And reset so below tests just the strict matching. | |
783 | $mod_value = $value; | |
784 | } | |
785 | ||
786 | if ($count == 0) { | |
787 | ||
788 | @names_via_short = prop_value_aliases($mod_prop, $mod_value); | |
789 | ||
790 | # If the 0th test fails, no sense in continuing with the others | |
791 | last unless is($names_via_short[0], $value, "prop_value_aliases: In '$prop', '$value' is the short name for '$mod_value'"); | |
792 | $short_name = $value; | |
793 | } | |
794 | elsif ($count == 1) { | |
795 | ||
796 | # Some properties have the same short and full name; no sense | |
797 | # repeating the test if the same. | |
798 | if ($value ne $short_name) { | |
799 | my @names_via_full = | |
800 | prop_value_aliases($mod_prop, $mod_value); | |
801 | is_deeply(\@names_via_full, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'"); | |
802 | } | |
803 | ||
804 | # Tests scalar context | |
805 | is(prop_value_aliases($prop, $short_name), $value, "'$value' is the long name for prop_value_aliases('$prop', '$short_name')"); | |
806 | } | |
807 | else { | |
808 | my @all_names = prop_value_aliases($mod_prop, $mod_value); | |
809 | is_deeply(\@all_names, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'"); | |
e72b6605 | 810 | ok((grep { &utf8::_loose_name(lc $_) eq &utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')"); |
7ef25837 KW |
811 | } |
812 | ||
e72b6605 | 813 | $pva_tested{&utf8::_loose_name(lc $prop) . "=" . &utf8::_loose_name(lc $value)} = 1; |
7ef25837 KW |
814 | $count++; |
815 | } | |
816 | } | |
61b174f6 | 817 | } # End of SKIP block |
7ef25837 KW |
818 | |
819 | # And test as best we can, the non-official pva's that mktables generates. | |
820 | foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) { | |
821 | foreach my $test (keys %$hash) { | |
822 | next if exists $pva_tested{$test}; # Skip if already tested | |
823 | ||
824 | my ($prop, $value) = split "=", $test; | |
825 | next unless defined $value; # prop_value_aliases() requires an input | |
826 | # 'value' | |
827 | my $mod_value; | |
828 | if ($hash == \%utf8::loose_to_file_of) { | |
829 | ||
830 | # Add extra characters to test loose-match rhs value | |
831 | $mod_value = "$extra_chars$value"; | |
832 | } | |
833 | else { # Here value is strictly matched. | |
834 | ||
835 | # Extra elements are added by mktables to this hash so that | |
836 | # something like "age=6.0" has a synonym of "age=6". It's not | |
837 | # clear to me (khw) if we should be encouraging those synonyms, so | |
838 | # don't test for them. | |
839 | next if $value !~ /\D/ && exists $hash->{"$prop=$value.0"}; | |
840 | ||
841 | # Verify that loose matching fails when only strict is called for. | |
842 | next unless is(prop_value_aliases($prop, "$extra_chars$value"), undef, | |
843 | "prop_value_aliases('$prop', '$extra_chars$value') returns undef since '$value' should be strictly matched"), | |
844 | ||
845 | # Strict matching does allow for underscores between digits. Test | |
846 | # for that. | |
847 | $mod_value = $value; | |
848 | while ($mod_value =~ s/(\d)(\d)/$1_$2/g) {} | |
849 | } | |
850 | ||
851 | # The lhs property is always loosely matched, so add in extra | |
852 | # characters to test that. | |
853 | my $mod_prop = "$extra_chars$prop"; | |
854 | ||
855 | if ($prop eq 'gc' && $value =~ /l[_&]$/) { | |
856 | # These two names are special in that they don't appear in the | |
857 | # returned list because they are discouraged from use. Verify | |
858 | # that they return the same list as a non-discouraged version. | |
859 | my @LC = prop_value_aliases('gc', 'lc'); | |
860 | my @l_ = prop_value_aliases($mod_prop, $mod_value); | |
861 | is_deeply(\@l_, \@LC, "prop_value_aliases('$mod_prop', '$mod_value) returns the same list as prop_value_aliases('gc', 'lc')"); | |
862 | } | |
863 | else { | |
e72b6605 | 864 | ok((grep { &utf8::_loose_name(lc $_) eq &utf8::_loose_name(lc $value) } |
7ef25837 KW |
865 | prop_value_aliases($mod_prop, $mod_value)), |
866 | "'$value' is listed as an alias for prop_value_aliases('$mod_prop', '$mod_value')"); | |
867 | } | |
868 | } | |
869 | } | |
870 | ||
871 | undef %pva_tested; | |
872 | ||
681d705c KW |
873 | no warnings 'once'; # We use some values once from 'required' modules. |
874 | ||
62b3b855 KW |
875 | use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP); |
876 | ||
877 | # There were some problems with caching interfering with prop_invlist() vs | |
878 | # prop_invmap() on binary properties, and also between the 3 properties where | |
879 | # Perl used the same 'To' name as another property (see utf8_heavy.pl). | |
880 | # So, before testing all of prop_invlist(), | |
881 | # 1) call prop_invmap() to try both orders of these name issues. This uses | |
882 | # up two of the 3 properties; the third will be left so that invlist() | |
883 | # on it gets called before invmap() | |
884 | # 2) call prop_invmap() on a generic binary property, ahead of invlist(). | |
885 | # This should test that the caching works in both directions. | |
886 | ||
887 | # These properties are not stable between Unicode versions, but the first few | |
888 | # elements are; just look at the first element to see if are getting the | |
889 | # distinction right. The general inversion map testing below will test the | |
890 | # whole thing. | |
891 | my $prop = "uc"; | |
892 | my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
d11155ec | 893 | is($format, 'al', "prop_invmap() format of '$prop' is 'al'"); |
bf7fe2df | 894 | is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); |
62b3b855 | 895 | is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); |
d11155ec | 896 | is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41"); |
62b3b855 KW |
897 | |
898 | $prop = "upper"; | |
899 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
d11155ec KW |
900 | is($format, 's', "prop_invmap() format of '$prop' is 's"); |
901 | is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); | |
62b3b855 KW |
902 | is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); |
903 | is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); | |
904 | ||
905 | $prop = "lower"; | |
906 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
d11155ec KW |
907 | is($format, 's', "prop_invmap() format of '$prop' is 's'"); |
908 | is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); | |
62b3b855 KW |
909 | is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); |
910 | is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); | |
911 | ||
912 | $prop = "lc"; | |
913 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
d11155ec | 914 | is($format, 'al', "prop_invmap() format of '$prop' is 'al'"); |
bf7fe2df | 915 | is($missing, '0', "prop_invmap() missing of '$prop' is '0'"); |
62b3b855 | 916 | is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); |
d11155ec | 917 | is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61"); |
62b3b855 KW |
918 | |
919 | # This property is stable and small, so can test all of it | |
920 | $prop = "ASCII_Hex_Digit"; | |
921 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
922 | is($format, 's', "prop_invmap() format of '$prop' is 's'"); | |
923 | is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); | |
924 | is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, 0x0041, | |
925 | 0x0047, 0x0061, 0x0067, 0x110000 ], | |
926 | "prop_invmap('$prop') code point list is correct"); | |
927 | is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , | |
928 | "prop_invmap('$prop') map list is correct"); | |
681d705c KW |
929 | |
930 | is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef"); | |
931 | is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef"); | |
932 | is(prop_invlist("Any"), 2, "prop_invlist('Any') returns the number of elements in scalar context"); | |
933 | my @invlist = prop_invlist("Is_Any"); | |
934 | is_deeply(\@invlist, [ 0, 0x110000 ], "prop_invlist works on 'Is_' prefixes"); | |
935 | is(prop_invlist("Is_Is_Any"), undef, "prop_invlist('Is_Is_Any') returns <undef> since two is's"); | |
936 | ||
937 | use Storable qw(dclone); | |
938 | ||
939 | is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns undef>)"); | |
940 | ||
62b3b855 | 941 | # The way both the tests for invlist and invmap work is that they take the |
681d705c KW |
942 | # lists returned by the functions and construct from them what the original |
943 | # file should look like, which are then compared with the file. If they are | |
944 | # identical, the test passes. What this tests isn't that the results are | |
62b3b855 | 945 | # correct, but that invlist and invmap haven't introduced errors beyond what |
681d705c KW |
946 | # are there in the files. As a small hedge against that, test some |
947 | # prop_invlist() tables fully with the known correct result. We choose | |
948 | # ASCII_Hex_Digit again, as it is stable. | |
949 | @invlist = prop_invlist("AHex"); | |
950 | is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041, | |
951 | 0x0047, 0x0061, 0x0067 ], | |
952 | "prop_invlist('AHex') is exactly the expected set of points"); | |
953 | @invlist = prop_invlist("AHex=f"); | |
954 | is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041, | |
955 | 0x0047, 0x0061, 0x0067 ], | |
956 | "prop_invlist('AHex=f') is exactly the expected set of points"); | |
957 | ||
958 | sub fail_with_diff ($$$$) { | |
959 | # For use below to output better messages | |
960 | my ($prop, $official, $constructed, $tested_function_name) = @_; | |
961 | ||
962 | is($constructed, $official, "$tested_function_name('$prop')"); | |
963 | diag("Comment out lines " . (__LINE__ - 1) . " through " . (__LINE__ + 1) . " in '$0' on Un*x-like systems to see just the differences. Uses the 'diff' first in your \$PATH"); | |
964 | return; | |
965 | ||
966 | fail("$tested_function_name('$prop')"); | |
967 | ||
968 | require File::Temp; | |
969 | my $off = File::Temp->new(); | |
6bbe4a24 | 970 | local $/ = "\n"; |
681d705c KW |
971 | chomp $official; |
972 | print $off $official, "\n"; | |
973 | close $off || die "Can't close official"; | |
974 | ||
975 | chomp $constructed; | |
976 | my $gend = File::Temp->new(); | |
977 | print $gend $constructed, "\n"; | |
978 | close $gend || die "Can't close gend"; | |
979 | ||
980 | my $diff = File::Temp->new(); | |
981 | system("diff $off $gend > $diff"); | |
982 | ||
983 | open my $fh, "<", $diff || die "Can't open $diff"; | |
984 | my @diffs = <$fh>; | |
985 | diag("In the diff output below '<' marks lines from the filesystem tables;\n'>' are from $tested_function_name()"); | |
986 | diag(@diffs); | |
987 | } | |
988 | ||
989 | my %tested_invlist; | |
990 | ||
991 | # Look at everything we think that mktables tells us exists, both loose and | |
992 | # strict | |
993 | foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of) | |
994 | { | |
995 | foreach my $table (keys %$set_of_tables) { | |
996 | ||
997 | my $mod_table; | |
998 | my ($prop_only, $value) = split "=", $table; | |
999 | if (defined $value) { | |
1000 | ||
1001 | # If this is to be loose matched, add in characters to test that. | |
1002 | if ($set_of_tables == \%utf8::loose_to_file_of) { | |
1003 | $value = "$extra_chars$value"; | |
1004 | } | |
1005 | else { # Strict match | |
1006 | ||
1007 | # Verify that loose matching fails when only strict is called | |
1008 | # for. | |
1009 | next unless is(prop_invlist("$prop_only=$extra_chars$value"), undef, "prop_invlist('$prop_only=$extra_chars$value') returns undef since should be strictly matched"); | |
1010 | ||
1011 | # Strict matching does allow for underscores between digits. | |
1012 | # Test for that. | |
1013 | while ($value =~ s/(\d)(\d)/$1_$2/g) {} | |
1014 | } | |
1015 | ||
1016 | # The property portion in compound form specifications always | |
1017 | # matches loosely | |
1018 | $mod_table = "$extra_chars$prop_only = $value"; | |
1019 | } | |
1020 | else { # Single-form. | |
1021 | ||
1022 | # Like above, use looose if required, and insert underscores | |
1023 | # between digits if strict. | |
1024 | if ($set_of_tables == \%utf8::loose_to_file_of) { | |
1025 | $mod_table = "$extra_chars$table"; | |
1026 | } | |
1027 | else { | |
1028 | $mod_table = $table; | |
1029 | while ($mod_table =~ s/(\d)(\d)/$1_$2/g) {} | |
1030 | } | |
1031 | } | |
1032 | ||
1033 | my @tested = prop_invlist($mod_table); | |
1034 | if ($table =~ /^_/) { | |
1035 | is(@tested, 0, "prop_invlist('$mod_table') returns an empty list since is internal-only"); | |
1036 | next; | |
1037 | } | |
1038 | ||
1039 | # If we have already tested a property that uses the same file, this | |
1040 | # list should be identical to the one that was tested, and can bypass | |
1041 | # everything else. | |
1042 | my $file = $set_of_tables->{$table}; | |
1043 | if (exists $tested_invlist{$file}) { | |
1044 | is_deeply(\@tested, $tested_invlist{$file}, "prop_invlist('$mod_table') gave same results as its name synonym"); | |
1045 | next; | |
1046 | } | |
1047 | $tested_invlist{$file} = dclone \@tested; | |
1048 | ||
1049 | # A leading '!' in the file name means that it is to be inverted. | |
1050 | my $invert = $file =~ s/^!//; | |
1051 | my $official = do "unicore/lib/$file.pl"; | |
1052 | ||
1053 | # Get rid of any trailing space and comments in the file. | |
1054 | $official =~ s/\s*(#.*)?$//mg; | |
6bbe4a24 | 1055 | local $/ = "\n"; |
681d705c | 1056 | chomp $official; |
6bbe4a24 | 1057 | $/ = $input_record_separator; |
681d705c KW |
1058 | |
1059 | # If we are to test against an inverted file, it is easier to invert | |
1060 | # our array than the file. | |
1061 | # The file only is valid for Unicode code points, while the inversion | |
1062 | # list is valid for all possible code points. Therefore, we must test | |
1063 | # just the Unicode part against the file. Later we will test for | |
1064 | # the non-Unicode part. | |
1065 | ||
1066 | my $before_invert; # Saves the pre-inverted table. | |
1067 | if ($invert) { | |
1068 | $before_invert = dclone \@tested; | |
1069 | if (@tested && $tested[0] == 0) { | |
1070 | shift @tested; | |
1071 | } else { | |
1072 | unshift @tested, 0; | |
1073 | } | |
1074 | if (@tested && $tested[-1] == 0x110000) { | |
1075 | pop @tested; | |
1076 | } | |
1077 | else { | |
1078 | push @tested, 0x110000; | |
1079 | } | |
1080 | } | |
1081 | ||
1082 | # Now construct a string from the list that should match the file. | |
1083 | # The file gives ranges of code points with starting and ending values | |
1084 | # in hex, like this: | |
1085 | # 0041\t005A | |
1086 | # 0061\t007A | |
1087 | # 00AA | |
1088 | # Our list has even numbered elements start ranges that are in the | |
1089 | # list, and odd ones that aren't in the list. Therefore the odd | |
1090 | # numbered ones are one beyond the end of the previous range, but | |
1091 | # otherwise don't get reflected in the file. | |
1092 | my $tested = ""; | |
1093 | my $i = 0; | |
1094 | for (; $i < @tested - 1; $i += 2) { | |
1095 | my $start = $tested[$i]; | |
1096 | my $end = $tested[$i+1] - 1; | |
1097 | if ($start == $end) { | |
1098 | $tested .= sprintf("%04X\n", $start); | |
1099 | } | |
1100 | else { | |
1101 | $tested .= sprintf "%04X\t%04X\n", $start, $end; | |
1102 | } | |
1103 | } | |
1104 | ||
1105 | # As mentioned earlier, the disk files only go up through Unicode, | |
1106 | # whereas the prop_invlist() ones go as high as necessary. The | |
1107 | # comparison is only valid through max Unicode. | |
1108 | if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) { | |
1109 | $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]); | |
1110 | } | |
6bbe4a24 | 1111 | local $/ = "\n"; |
681d705c | 1112 | chomp $tested; |
6bbe4a24 | 1113 | $/ = $input_record_separator; |
681d705c KW |
1114 | if ($tested ne $official) { |
1115 | fail_with_diff($mod_table, $official, $tested, "prop_invlist"); | |
1116 | next; | |
1117 | } | |
1118 | ||
1119 | # Here, it matched the table. Now need to check for if it is correct | |
1120 | # for beyond Unicode. First, calculate if is the default table or | |
1121 | # not. This is the same algorithm as used internally in | |
1122 | # prop_invlist(), so if it is wrong there, this test won't catch it. | |
1123 | my $prop = lc $table; | |
1124 | ($prop_only, $table) = split /\s*[:=]\s*/, $prop; | |
1125 | if (defined $table) { | |
1126 | ||
1127 | # May have optional prefixed 'is' | |
e72b6605 | 1128 | $prop = &utf8::_loose_name($prop_only) =~ s/^is//r; |
681d705c | 1129 | $prop = $utf8::loose_property_name_of{$prop}; |
e72b6605 | 1130 | $prop .= "=" . &utf8::_loose_name($table); |
681d705c KW |
1131 | } |
1132 | else { | |
e72b6605 | 1133 | $prop = &utf8::_loose_name($prop); |
681d705c KW |
1134 | } |
1135 | my $is_default = exists $Unicode::UCD::loose_defaults{$prop}; | |
1136 | ||
1137 | @tested = @$before_invert if $invert; # Use the original | |
1138 | if (@tested % 2 == 0) { | |
1139 | ||
1140 | # If there are an even number of elements, the final one starts a | |
1141 | # range (going to infinity) of code points that are not in the | |
1142 | # list. | |
1143 | if ($is_default) { | |
1144 | fail("prop_invlist('$mod_table')"); | |
1145 | diag("default table doesn't goto infinity"); | |
1146 | use Data::Dumper; | |
1147 | diag Dumper \@tested; | |
1148 | next; | |
1149 | } | |
1150 | } | |
1151 | else { | |
1152 | # An odd number of elements means the final one starts a range | |
1153 | # (going to infinity of code points that are in the list. | |
1154 | if (! $is_default) { | |
1155 | fail("prop_invlist('$mod_table')"); | |
1156 | diag("non-default table needs to stop in the Unicode range"); | |
1157 | use Data::Dumper; | |
1158 | diag Dumper \@tested; | |
1159 | next; | |
1160 | } | |
1161 | } | |
1162 | ||
1163 | pass("prop_invlist('$mod_table')"); | |
1164 | } | |
1165 | } | |
1166 | ||
62b3b855 KW |
1167 | # Now test prop_invmap(). |
1168 | ||
1169 | @list = prop_invmap("Unknown property"); | |
1170 | is (@list, 0, "prop_invmap(<Unknown property>) returns an empty list"); | |
1171 | @list = prop_invmap(undef); | |
1172 | is (@list, 0, "prop_invmap(undef) returns an empty list"); | |
1173 | ok (! eval "prop_invmap('gc')" && $@ ne "", | |
1174 | "prop_invmap('gc') dies in scalar context"); | |
1175 | @list = prop_invmap("_X_Begin"); | |
1176 | is (@list, 0, "prop_invmap(<internal property>) returns an empty list"); | |
1177 | @list = prop_invmap("InKana"); | |
1178 | is(@list, 0, "prop_invmap(<user-defined property returns undef>)"); | |
1179 | @list = prop_invmap("Perl_Decomposition_Mapping"), undef, | |
1180 | is(@list, 0, "prop_invmap('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only"); | |
1181 | @list = prop_invmap("Perl_Charnames"), undef, | |
1182 | is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-only"); | |
1183 | @list = prop_invmap("Is_Is_Any"); | |
1184 | is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's"); | |
1185 | ||
1186 | # The set of properties to test on has already been compiled into %props by | |
1187 | # the prop_aliases() tests. | |
1188 | ||
1189 | my %tested_invmaps; | |
1190 | ||
1191 | # Like prop_invlist(), prop_invmap() is tested by comparing the results | |
1192 | # returned by the function with the tables that mktables generates. Some of | |
1193 | # these tables are directly stored as files on disk, in either the unicore or | |
1194 | # unicore/To directories, and most should be listed in the mktables generated | |
1195 | # hash %utf8::loose_property_to_file_of, with a few additional ones that this | |
1196 | # handles specially. For these, the files are read in directly, massaged, and | |
1197 | # compared with what invmap() returns. The SPECIALS hash in some of these | |
1198 | # files overrides values in the main part of the file. | |
1199 | # | |
1200 | # The other properties are tested indirectly by generating all the possible | |
1201 | # inversion lists for the property, and seeing if those match the inversion | |
1202 | # lists returned by prop_invlist(), which has already been tested. | |
1203 | ||
1204 | PROPERTY: | |
1205 | foreach my $prop (keys %props) { | |
e72b6605 | 1206 | my $loose_prop = &utf8::_loose_name(lc $prop); |
62b3b855 KW |
1207 | my $suppressed = grep { $_ eq $loose_prop } |
1208 | @Unicode::UCD::suppressed_properties; | |
1209 | ||
1210 | # Find the short and full names that this property goes by | |
1211 | my ($name, $full_name) = prop_aliases($prop); | |
1212 | if (! $name) { | |
1213 | if (! $suppressed) { | |
1214 | fail("prop_invmap('$prop')"); | |
1215 | diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap"); | |
1216 | } | |
1217 | next PROPERTY; | |
1218 | } | |
1219 | ||
1220 | # Normalize the short name, as it is stored in the hashes under the | |
1221 | # normalized version. | |
e72b6605 | 1222 | $name = &utf8::_loose_name(lc $name); |
62b3b855 KW |
1223 | |
1224 | # Add in the characters that are supposed to be ignored to test loose | |
1225 | # matching, which the tested function applies to all properties | |
1226 | my $mod_prop = "$extra_chars$prop"; | |
1227 | ||
1228 | my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop); | |
1229 | my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ]; | |
1230 | ||
1231 | # If have already tested this property under a different name, merely | |
1232 | # compare the return from now with the saved one from before. | |
1233 | if (exists $tested_invmaps{$name}) { | |
1234 | is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'"); | |
1235 | next PROPERTY; | |
1236 | } | |
1237 | $tested_invmaps{$name} = dclone $return_ref; | |
1238 | ||
1239 | # If prop_invmap() returned nothing, is ok iff is a property whose file is | |
1240 | # not generated. | |
1241 | if ($suppressed) { | |
1242 | if (defined $format) { | |
1243 | fail("prop_invmap('$mod_prop')"); | |
1244 | diag("did not return undef for suppressed property $prop"); | |
1245 | } | |
1246 | next PROPERTY; | |
1247 | } | |
1248 | elsif (!defined $format) { | |
1249 | fail("prop_invmap('$mod_prop')"); | |
1250 | diag("'$prop' is unknown to prop_invmap()"); | |
1251 | next PROPERTY; | |
1252 | } | |
1253 | ||
1254 | # The two parallel arrays must have the same number of elements. | |
1255 | if (@$invlist_ref != @$invmap_ref) { | |
1256 | fail("prop_invmap('$mod_prop')"); | |
1257 | diag("invlist has " | |
1258 | . scalar @$invlist_ref | |
1259 | . " while invmap has " | |
1260 | . scalar @$invmap_ref | |
1261 | . " elements"); | |
1262 | next PROPERTY; | |
1263 | } | |
1264 | ||
1265 | # The last element must be for the above-Unicode code points, and must be | |
1266 | # for the default value. | |
1267 | if ($invlist_ref->[-1] != 0x110000) { | |
1268 | fail("prop_invmap('$mod_prop')"); | |
1269 | diag("The last inversion list element is not 0x110000"); | |
1270 | next PROPERTY; | |
1271 | } | |
1272 | if ($invmap_ref->[-1] ne $missing) { | |
1273 | fail("prop_invmap('$mod_prop')"); | |
1274 | diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'"); | |
1275 | next PROPERTY; | |
1276 | } | |
1277 | ||
1278 | if ($name eq 'bmg') { # This one has an atypical $missing | |
1279 | if ($missing ne "") { | |
1280 | fail("prop_invmap('$mod_prop')"); | |
1281 | diag("The missings should be \"\"; got '$missing'"); | |
1282 | next PROPERTY; | |
1283 | } | |
1284 | } | |
4f143a72 | 1285 | elsif ($format =~ /^ a (?!r) /x) { |
b0b13ada KW |
1286 | if ($full_name eq 'Perl_Decimal_Digit') { |
1287 | if ($missing ne "") { | |
1288 | fail("prop_invmap('$mod_prop')"); | |
1289 | diag("The missings should be \"\"; got '$missing'"); | |
1290 | next PROPERTY; | |
1291 | } | |
1292 | } | |
1293 | elsif ($missing ne "0") { | |
bf7fe2df KW |
1294 | fail("prop_invmap('$mod_prop')"); |
1295 | diag("The missings should be '0'; got '$missing'"); | |
1296 | next PROPERTY; | |
1297 | } | |
1298 | } | |
62b3b855 KW |
1299 | elsif ($missing =~ /[<>]/) { |
1300 | fail("prop_invmap('$mod_prop')"); | |
1301 | diag("The missings should NOT be something with <...>'"); | |
1302 | next PROPERTY; | |
1303 | ||
1304 | # I don't want to hard code in what all the missings should be, so | |
1305 | # those don't get fully tested. | |
1306 | } | |
1307 | ||
1308 | # Certain properties don't have their own files, but must be constructed | |
1309 | # using proxies. | |
1310 | my $proxy_prop = $name; | |
1311 | if ($full_name eq 'Present_In') { | |
1312 | $proxy_prop = "age"; # The maps for these two props are identical | |
1313 | } | |
1314 | elsif ($full_name eq 'Simple_Case_Folding' | |
1315 | || $full_name =~ /Simple_ (.) .*? case_Mapping /x) | |
1316 | { | |
1317 | if ($full_name eq 'Simple_Case_Folding') { | |
1318 | $proxy_prop = 'cf'; | |
1319 | } | |
1320 | else { | |
1321 | # We captured the U, L, or T, leading to uc, lc, or tc. | |
1322 | $proxy_prop = lc $1 . "c"; | |
1323 | } | |
d11155ec | 1324 | if ($format ne "a") { |
62b3b855 | 1325 | fail("prop_invmap('$mod_prop')"); |
d11155ec | 1326 | diag("The format should be 'a'; got '$format'"); |
62b3b855 KW |
1327 | next PROPERTY; |
1328 | } | |
1329 | } | |
1330 | ||
5bbfa552 KW |
1331 | if ($format !~ / ^ (?: a [der]? | ale? | n | sl? ) $ /x) { |
1332 | fail("prop_invmap('$mod_prop')"); | |
1333 | diag("Unknown format '$format'"); | |
1334 | next PROPERTY; | |
1335 | } | |
1336 | ||
62b3b855 KW |
1337 | my $base_file; |
1338 | my $official; | |
1339 | ||
1340 | # Handle the properties that have full disk files for them (except the | |
1341 | # Name property which is structurally enough different that it is handled | |
1342 | # separately below.) | |
1343 | if ($name ne 'na' | |
1344 | && ($name eq 'blk' | |
1345 | || defined | |
1346 | ($base_file = $utf8::loose_property_to_file_of{$proxy_prop}) | |
1347 | || exists $utf8::loose_to_file_of{$proxy_prop} | |
1348 | || $name eq "dm")) | |
1349 | { | |
1350 | # In the above, blk is done unconditionally, as we need to test that | |
1351 | # the old-style block names are returned, even if mktables has | |
1352 | # generated a file for the new-style; the test for dm comes afterward, | |
1353 | # so that if a file has been generated for it explicitly, we use that | |
1354 | # file (which is valid, unlike blk) instead of the combo | |
1355 | # Decomposition.pl files. | |
1356 | my $file; | |
1357 | my $is_binary = 0; | |
1358 | if ($name eq 'blk') { | |
1359 | ||
1360 | # The blk property is special. The original file with old block | |
1361 | # names is retained, and the default is to not write out a | |
1362 | # new-name file. What we do is get the old names into a data | |
1363 | # structure, and from that create what the new file would look | |
1364 | # like. $base_file is needed to be defined, just to avoid a | |
1365 | # message below. | |
1366 | $base_file = "This is a dummy name"; | |
1367 | my $blocks_ref = charblocks(); | |
1368 | $official = ""; | |
1369 | for my $range (sort { $a->[0][0] <=> $b->[0][0] } | |
1370 | values %$blocks_ref) | |
1371 | { | |
1372 | # Translate the charblocks() data structure to what the file | |
1373 | # would like. | |
1374 | $official .= sprintf"%04X\t%04X\t%s\n", | |
1375 | $range->[0][0], | |
1376 | $range->[0][1], | |
1377 | $range->[0][2]; | |
1378 | } | |
1379 | } | |
1380 | else { | |
d11155ec | 1381 | $base_file = "Decomposition" if $format eq 'ad'; |
62b3b855 KW |
1382 | |
1383 | # Above leaves $base_file undefined only if it came from the hash | |
1384 | # below. This should happen only when it is a binary property | |
1385 | # (and are accessing via a single-form name, like 'In_Latin1'), | |
1386 | # and so it is stored in a different directory than the To ones. | |
1387 | # XXX Currently, the only cases where it is complemented are the | |
1388 | # ones that have no code points. And it works out for these that | |
1389 | # 1) complementing them, and then 2) adding or subtracting the | |
1390 | # initial 0 and final 110000 cancel each other out. But further | |
1391 | # work would be needed in the unlikely event that an inverted | |
1392 | # property comes along without these characteristics | |
1393 | if (!defined $base_file) { | |
1394 | $base_file = $utf8::loose_to_file_of{$proxy_prop}; | |
1395 | $is_binary = ($base_file =~ s/^!//) ? -1 : 1; | |
1396 | $base_file = "lib/$base_file"; | |
1397 | } | |
1398 | ||
1399 | # Read in the file | |
62b3b855 KW |
1400 | $file = "unicore/$base_file.pl"; |
1401 | $official = do $file; | |
1402 | ||
1403 | # Get rid of any trailing space and comments in the file. | |
1404 | $official =~ s/\s*(#.*)?$//mg; | |
1405 | ||
d11155ec | 1406 | if ($format eq 'ad') { |
bea2c146 KW |
1407 | my @official = split /\n/, $official; |
1408 | $official = ""; | |
1409 | foreach my $line (@official) { | |
1410 | my ($start, $end, $value) | |
1411 | = $line =~ / ^ (.+?) \t (.*?) \t (.+?) | |
1412 | \s* ( \# .* )? $ /x; | |
1413 | # Decomposition.pl also has the <compatible> types in it, | |
1414 | # which should be removed. | |
1415 | $value =~ s/<.*?> //; | |
1416 | $official .= "$start\t\t$value\n"; | |
1417 | ||
1418 | # If this is a multi-char range, we turn it into as many | |
1419 | # single character ranges as necessary. This makes things | |
1420 | # easier below. | |
1421 | if ($end ne "") { | |
1422 | for my $i (hex($start) + 1 .. hex $end) { | |
1423 | $official .= sprintf "%04X\t\t%s\n", $i, $value; | |
1424 | } | |
1425 | } | |
1426 | } | |
1427 | } | |
62b3b855 | 1428 | } |
6bbe4a24 | 1429 | local $/ = "\n"; |
62b3b855 | 1430 | chomp $official; |
6bbe4a24 | 1431 | $/ = $input_record_separator; |
62b3b855 | 1432 | |
3eb27b83 KW |
1433 | # Get the format for the file, and if there are any special elements, |
1434 | # get a reference to them. | |
bf7fe2df KW |
1435 | my $swash_name = $utf8::file_to_swash_name{$base_file}; |
1436 | my $specials_ref; | |
3eb27b83 | 1437 | my $file_format; |
bf7fe2df KW |
1438 | if ($swash_name) { |
1439 | $specials_ref = $utf8::SwashInfo{$swash_name}{'specials_name'}; | |
62b3b855 KW |
1440 | if ($specials_ref) { |
1441 | ||
1442 | # Convert from the name to the actual reference. | |
1443 | no strict 'refs'; | |
1444 | $specials_ref = \%{$specials_ref}; | |
1445 | } | |
3eb27b83 KW |
1446 | |
1447 | $file_format = $utf8::SwashInfo{$swash_name}{'format'}; | |
62b3b855 KW |
1448 | } |
1449 | ||
1450 | # Certain of the proxy properties have to be adjusted to match the | |
1451 | # real ones. | |
ae1bcb1f | 1452 | if ($full_name =~ /^(Case_Folding|(Lower|Title|Upper)case_Mapping)/) { |
62b3b855 KW |
1453 | |
1454 | # Here we have either | |
1455 | # 1) Case_Folding; or | |
1456 | # 2) a proxy that is a full mapping, which means that what the | |
1457 | # real property is is the equivalent simple mapping. | |
1458 | # In both cases, the file will have a standard list containing | |
1459 | # simple mappings (to a single code point), and a specials hash | |
1460 | # which contains all the mappings that are to multiple code | |
1461 | # points. First, extract a list containing all the file's simple | |
1462 | # mappings. | |
1463 | my @list; | |
1464 | for (split "\n", $official) { | |
1465 | my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?) | |
bea2c146 | 1466 | \s* ( \# .* )? $ /x; |
62b3b855 | 1467 | $end = $start if $end eq ""; |
bf7fe2df | 1468 | push @list, [ hex $start, hex $end, $value ]; |
62b3b855 KW |
1469 | } |
1470 | ||
ae1bcb1f | 1471 | # For these mappings, the file contains all the simple mappings, |
62b3b855 | 1472 | # including the ones that are overridden by the specials. These |
ae1bcb1f | 1473 | # need to be removed as the list is for just the full ones. |
62b3b855 KW |
1474 | |
1475 | # Go through any special mappings one by one. They are packed. | |
1476 | my $i = 0; | |
1477 | foreach my $utf8_cp (sort keys %$specials_ref) { | |
1478 | my $cp = unpack("C0U", $utf8_cp); | |
1479 | ||
62b3b855 KW |
1480 | # Find the spot in the @list of simple mappings that this |
1481 | # special applies to; uses a linear search. | |
1482 | while ($i < @list -1 ) { | |
bea2c146 | 1483 | last if $cp <= $list[$i][1]; |
62b3b855 KW |
1484 | $i++; |
1485 | } | |
1486 | ||
bea2c146 KW |
1487 | # Here $i is such that it points to the first range which ends |
1488 | # at or above cp, and hence is the only range that could | |
1489 | # possibly contain it. | |
1490 | ||
1491 | # If not in this range, no range contains it: nothing to | |
1492 | # remove. | |
1493 | next if $cp < $list[$i][0]; | |
1494 | ||
1495 | # Otherwise, remove the existing entry. If it is the first | |
1496 | # element of the range... | |
1497 | if ($cp == $list[$i][0]) { | |
62b3b855 | 1498 | |
bea2c146 KW |
1499 | # ... and there are other elements in the range, just shorten |
1500 | # the range to exclude this code point. | |
1501 | if ($list[$i][1] > $list[$i][0]) { | |
1502 | $list[$i][0]++; | |
1503 | } | |
62b3b855 | 1504 | |
bea2c146 KW |
1505 | # ... but if it is the only element in the range, remove |
1506 | # it entirely. | |
1507 | else { | |
1508 | splice @list, $i, 1; | |
1509 | } | |
1510 | } | |
1511 | else { # Is somewhere in the middle of the range | |
1512 | # Split the range into two, excluding this one in the | |
1513 | # middle | |
1514 | splice @list, $i, 1, | |
1515 | [ $list[$i][0], $cp - 1, $list[$i][2] ], | |
1516 | [ $cp + 1, $list[$i][1], $list[$i][2] ]; | |
1517 | } | |
62b3b855 KW |
1518 | } |
1519 | ||
1520 | # Here, have gone through all the specials, modifying @list as | |
1521 | # needed. Turn it back into what the file should look like. | |
bf7fe2df KW |
1522 | $official = ""; |
1523 | for my $element (@list) { | |
1524 | $official .= "\n" if $official; | |
1525 | if ($element->[1] == $element->[0]) { | |
1526 | $official .= sprintf "%04X\t\t%s", $element->[0], $element->[2]; | |
1527 | } | |
1528 | else { | |
1529 | $official .= sprintf "%04X\t%04X\t%s", $element->[0], $element->[1], $element->[2]; | |
1530 | } | |
1531 | } | |
62b3b855 | 1532 | } |
ae1bcb1f KW |
1533 | elsif ($full_name =~ /Simple_(Case_Folding|(Lower|Title|Upper)case_Mapping)/) |
1534 | { | |
62b3b855 | 1535 | |
ae1bcb1f | 1536 | # These properties have everything in the regular array, and the |
62b3b855 | 1537 | # specials are superfluous. |
ae1bcb1f | 1538 | undef $specials_ref; |
62b3b855 | 1539 | } |
3eb27b83 | 1540 | elsif ($format !~ /^a/ && defined $file_format && $file_format eq 'x') { |
bf7fe2df | 1541 | |
3eb27b83 KW |
1542 | # For these properties the file is output using hex notation for the |
1543 | # map. Convert from hex to decimal. | |
bf7fe2df KW |
1544 | my @lines = split "\n", $official; |
1545 | foreach my $line (@lines) { | |
3eb27b83 KW |
1546 | my ($lower, $upper, $map) = split "\t", $line; |
1547 | $line = "$lower\t$upper\t" . hex $map; | |
bf7fe2df KW |
1548 | } |
1549 | $official = join "\n", @lines; | |
1550 | } | |
62b3b855 KW |
1551 | |
1552 | # Here, in $official, we have what the file looks like, or should like | |
1553 | # if we've had to fix it up. Now take the invmap() output and reverse | |
1554 | # engineer from that what the file should look like. Each iteration | |
1555 | # appends the next line to the running string. | |
1556 | my $tested_map = ""; | |
1557 | ||
1558 | # Create a copy of the file's specials hash. (It has been undef'd if | |
1559 | # we know it isn't relevant to this property, so if it exists, it's an | |
1560 | # error or is relevant). As we go along, we delete from that copy. | |
1561 | # If a delete fails, or something is left over after we are done, | |
1562 | # it's an error | |
1563 | my %specials = %$specials_ref if $specials_ref; | |
1564 | ||
1565 | # The extra -1 is because the final element has been tested above to | |
1566 | # be for anything above Unicode. The file doesn't go that high. | |
bea2c146 | 1567 | for (my $i = 0; $i < @$invlist_ref - 1; $i++) { |
62b3b855 KW |
1568 | |
1569 | # If the map element is a reference, have to stringify it (but | |
1570 | # don't do so if the format doesn't allow references, so that an | |
1571 | # improper format will generate an error. | |
1572 | if (ref $invmap_ref->[$i] | |
d11155ec | 1573 | && ($format eq 'ad' || $format =~ /^ . l /x)) |
62b3b855 | 1574 | { |
3b6a8189 | 1575 | # The stringification depends on the format. |
62b3b855 | 1576 | if ($format eq 'sl') { |
3b6a8189 KW |
1577 | |
1578 | # At the time of this writing, there are two types of 'sl' | |
1579 | # format One, in Name_Alias, has multiple separate entries | |
1580 | # for each code point; the other, in Script_Extension, is space | |
1581 | # separated. Assume the latter for non-Name_Alias. | |
1582 | if ($full_name ne 'Name_Alias') { | |
1583 | $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]}; | |
1584 | } | |
1585 | else { | |
1586 | # For Name_Alias, we emulate the file. Entries with | |
1587 | # just one value don't need any changes, but we | |
1588 | # convert the list entries into a series of lines for | |
1589 | # the file, starting with the first name. The | |
1590 | # succeeding entries are on separate lines, with the | |
1591 | # code point repeated for each one and then two tabs, | |
1592 | # then the value. Code at the end of the loop will | |
1593 | # set up the first line with its code point and two | |
1594 | # tabs before the value, just as it does for every | |
1595 | # other property; thus the special handling of the | |
1596 | # first line. | |
1597 | if (ref $invmap_ref->[$i]) { | |
1598 | my $hex_cp = sprintf("%04X", $invlist_ref->[$i]); | |
1599 | my $concatenated = $invmap_ref->[$i][0]; | |
1600 | for (my $j = 1; $j < @{$invmap_ref->[$i]}; $j++) { | |
1601 | $concatenated .= "\n$hex_cp\t\t" . $invmap_ref->[$i][$j]; | |
1602 | } | |
1603 | $invmap_ref->[$i] = $concatenated; | |
1604 | } | |
1605 | } | |
62b3b855 | 1606 | } |
d11155ec | 1607 | elsif ($format =~ / ^ al e? $/x) { |
62b3b855 | 1608 | |
d11155ec | 1609 | # For a al property, the stringified result should be in |
62b3b855 KW |
1610 | # the specials hash. The key is the packed code point, |
1611 | # and the value is the packed map. | |
1612 | my $value; | |
1613 | if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) { | |
1614 | fail("prop_invmap('$mod_prop')"); | |
1615 | diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); | |
1616 | next PROPERTY; | |
1617 | } | |
1618 | my $packed = pack "U*", @{$invmap_ref->[$i]}; | |
1619 | if ($value ne $packed) { | |
1620 | fail("prop_invmap('$mod_prop')"); | |
1621 | diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'"); | |
1622 | next PROPERTY; | |
1623 | } | |
1624 | ||
1625 | # As this doesn't get tested when we later compare with | |
1626 | # the actual file, it could be out of order and we | |
1627 | # wouldn't know it. | |
1628 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1629 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1630 | { | |
1631 | fail("prop_invmap('$mod_prop')"); | |
1632 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1633 | next PROPERTY; | |
1634 | } | |
1635 | next; | |
1636 | } | |
d11155ec | 1637 | elsif ($format eq 'ad') { |
62b3b855 KW |
1638 | |
1639 | # The decomposition mapping file has the code points as | |
1640 | # a string of space-separated hex constants. | |
1641 | $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]}; | |
1642 | } | |
1643 | else { | |
1644 | fail("prop_invmap('$mod_prop')"); | |
1645 | diag("Can't handle format '$format'"); | |
1646 | next PROPERTY; | |
1647 | } | |
1648 | } | |
d11155ec | 1649 | elsif ($format eq 'ad' || $format eq 'ale') { |
34132297 | 1650 | |
d11155ec KW |
1651 | # The numerics in the returned map are stored as adjusted |
1652 | # decimal integers. The defaults are 0, and don't appear in | |
1653 | # $official, and are excluded later, but the elements must be | |
1654 | # converted back to their hex values before comparing with | |
34132297 | 1655 | # $official, as these files, for backwards compatibility, are |
d11155ec | 1656 | # not stored as adjusted. (There currently is only one ale |
34132297 | 1657 | # property, nfkccf. If that changed this would also have to.) |
bea2c146 KW |
1658 | if ($invmap_ref->[$i] =~ / ^ -? \d+ $ /x |
1659 | && $invmap_ref->[$i] != 0) | |
1660 | { | |
d11155ec KW |
1661 | my $next = $invmap_ref->[$i] + 1; |
1662 | $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]); | |
1663 | ||
1664 | # If there are other elements in this range they need to | |
1665 | # be adjusted; they must individually be re-mapped. Do | |
1666 | # this by splicing in a new element into the list and the | |
1667 | # map containing the remainder of the range. Next time | |
1668 | # through we will look at that (possibly splicing again | |
1669 | # until the whole range is processed). | |
bea2c146 KW |
1670 | if ($invlist_ref->[$i+1] > $invlist_ref->[$i] + 1) { |
1671 | splice @$invlist_ref, $i+1, 0, | |
1672 | $invlist_ref->[$i] + 1; | |
d11155ec | 1673 | splice @$invmap_ref, $i+1, 0, $next; |
bea2c146 KW |
1674 | } |
1675 | } | |
d11155ec | 1676 | if ($format eq 'ale' && $invmap_ref->[$i] eq "") { |
62b3b855 | 1677 | |
d11155ec | 1678 | # ale properties have maps to the empty string that also |
4066e594 KW |
1679 | # should be in the specials hash, with the key the packed |
1680 | # code point, and the map just empty. | |
1681 | my $value; | |
1682 | if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) { | |
1683 | fail("prop_invmap('$mod_prop')"); | |
1684 | diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); | |
1685 | next PROPERTY; | |
1686 | } | |
1687 | if ($value ne "") { | |
1688 | fail("prop_invmap('$mod_prop')"); | |
1689 | diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]); | |
1690 | next PROPERTY; | |
1691 | } | |
62b3b855 | 1692 | |
4066e594 KW |
1693 | # As this doesn't get tested when we later compare with |
1694 | # the actual file, it could be out of order and we | |
1695 | # wouldn't know it. | |
1696 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1697 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1698 | { | |
1699 | fail("prop_invmap('$mod_prop')"); | |
1700 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1701 | next PROPERTY; | |
1702 | } | |
1703 | next; | |
34132297 | 1704 | } |
62b3b855 KW |
1705 | } |
1706 | elsif ($is_binary) { # These binary files don't have an explicit Y | |
1707 | $invmap_ref->[$i] =~ s/Y//; | |
1708 | } | |
1709 | ||
1710 | # The file doesn't include entries that map to $missing, so don't | |
1711 | # include it in the built-up string. But make sure that it is in | |
1712 | # the correct order in the input. | |
1713 | if ($invmap_ref->[$i] eq $missing) { | |
1714 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1715 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1716 | { | |
1717 | fail("prop_invmap('$mod_prop')"); | |
1718 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1719 | next PROPERTY; | |
1720 | } | |
1721 | next; | |
1722 | } | |
1723 | ||
7f131b96 KW |
1724 | # The ad property has one entry which isn't in the file. |
1725 | # Ignore it, but make sure it is in order. | |
1726 | if ($format eq 'ad' | |
1727 | && $invmap_ref->[$i] eq '<hangul syllable>' | |
1728 | && $invlist_ref->[$i] == 0xAC00) | |
1729 | { | |
1730 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1731 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
62b3b855 | 1732 | { |
7f131b96 KW |
1733 | fail("prop_invmap('$mod_prop')"); |
1734 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1735 | next PROPERTY; | |
62b3b855 | 1736 | } |
7f131b96 KW |
1737 | next; |
1738 | } | |
62b3b855 KW |
1739 | |
1740 | # Finally have figured out what the map column in the file should | |
1741 | # be. Append the line to the running string. | |
1742 | my $start = $invlist_ref->[$i]; | |
1743 | my $end = $invlist_ref->[$i+1] - 1; | |
1744 | $end = ($start == $end) ? "" : sprintf("%04X", $end); | |
1745 | if ($invmap_ref->[$i] ne "") { | |
1746 | $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i]; | |
1747 | } | |
1748 | elsif ($end ne "") { | |
1749 | $tested_map .= sprintf "%04X\t%s\n", $start, $end; | |
1750 | } | |
1751 | else { | |
1752 | $tested_map .= sprintf "%04X\n", $start; | |
1753 | } | |
1754 | } # End of looping over all elements. | |
1755 | ||
1756 | # Here are done with generating what the file should look like | |
1757 | ||
6bbe4a24 | 1758 | local $/ = "\n"; |
62b3b855 | 1759 | chomp $tested_map; |
6bbe4a24 | 1760 | $/ = $input_record_separator; |
62b3b855 KW |
1761 | |
1762 | # And compare. | |
1763 | if ($tested_map ne $official) { | |
1764 | fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); | |
1765 | next PROPERTY; | |
1766 | } | |
1767 | ||
1768 | # There shouldn't be any specials unaccounted for. | |
1769 | if (keys %specials) { | |
1770 | fail("prop_invmap('$mod_prop')"); | |
1771 | diag("Unexpected specials: " . join ", ", keys %specials); | |
1772 | next PROPERTY; | |
1773 | } | |
1774 | } | |
1775 | elsif ($format eq 'n') { | |
1776 | ||
1777 | # Handle the Name property similar to the above. But the file is | |
1778 | # sufficiently different that it is more convenient to make a special | |
3b6a8189 KW |
1779 | # case for it. It is a combination of the Name, Unicode1_Name, and |
1780 | # Name_Alias properties, and named sequences. We need to remove all | |
1781 | # but the Name in order to do the comparison. | |
62b3b855 KW |
1782 | |
1783 | if ($missing ne "") { | |
1784 | fail("prop_invmap('$mod_prop')"); | |
1785 | diag("The missings should be \"\"; got \"missing\""); | |
1786 | next PROPERTY; | |
1787 | } | |
1788 | ||
1789 | $official = do "unicore/Name.pl"; | |
1790 | ||
1791 | # Get rid of the named sequences portion of the file. These don't | |
1792 | # have a tab before the first blank on a line. | |
1793 | $official =~ s/ ^ [^\t]+ \ .*? \n //xmg; | |
1794 | ||
1795 | # And get rid of the controls. These are named in the file, but | |
3b6a8189 KW |
1796 | # shouldn't be in the property. This gets rid of the two ranges in |
1797 | # one fell swoop, and also all the Unicode1_Name values that may not | |
1798 | # be in Name_Alias. | |
62b3b855 KW |
1799 | $official =~ s/ 00000 \t .* 0001F .*? \n//xs; |
1800 | $official =~ s/ 0007F \t .* 0009F .*? \n//xs; | |
1801 | ||
3b6a8189 KW |
1802 | # And remove the aliases. We read in the Name_Alias property, and go |
1803 | # through them one by one. | |
1804 | my ($aliases_code_points, $aliases_maps, undef, undef) | |
1805 | = &prop_invmap('Name_Alias'); | |
1806 | for (my $i = 0; $i < @$aliases_code_points; $i++) { | |
1807 | my $code_point = $aliases_code_points->[$i]; | |
1808 | ||
1809 | # Already removed these above. | |
1810 | next if $code_point <= 0x1F | |
1811 | || ($code_point >= 0x7F && $code_point <= 0x9F); | |
1812 | ||
1813 | my $hex_code_point = sprintf "%05X", $code_point; | |
1814 | ||
1815 | # Convert to a list if not already to make the following loop | |
1816 | # control uniform. | |
1817 | $aliases_maps->[$i] = [ $aliases_maps->[$i] ] | |
1818 | if ! ref $aliases_maps->[$i]; | |
1819 | ||
1820 | # Remove each alias for this code point from the file | |
1821 | foreach my $alias (@{$aliases_maps->[$i]}) { | |
1822 | ||
1823 | # Remove the alias type from the entry, retaining just the name. | |
1824 | $alias =~ s/:.*//; | |
1825 | ||
1826 | $alias = quotemeta($alias); | |
1827 | $official =~ s/$hex_code_point \t $alias \n //x; | |
62b3b855 KW |
1828 | } |
1829 | } | |
6bbe4a24 | 1830 | local $/ = "\n"; |
62b3b855 | 1831 | chomp $official; |
6bbe4a24 | 1832 | $/ = $input_record_separator; |
62b3b855 KW |
1833 | |
1834 | # Here have adjusted the file. We also have to adjust the returned | |
1835 | # inversion map by checking and deleting all the lines in it that | |
1836 | # won't be in the file. These are the lines that have generated | |
1837 | # things, like <hangul syllable>. | |
1838 | my $tested_map = ""; # Current running string | |
1839 | my @code_point_in_names = | |
1840 | @Unicode::UCD::code_points_ending_in_code_point; | |
1841 | ||
1842 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
1843 | my $start = $invlist_ref->[$i]; | |
1844 | my $end = $invlist_ref->[$i+1] - 1; | |
1845 | if ($invmap_ref->[$i] eq $missing) { | |
1846 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1847 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1848 | { | |
1849 | fail("prop_invmap('$mod_prop')"); | |
1850 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1851 | next PROPERTY; | |
1852 | } | |
1853 | next; | |
1854 | } | |
1855 | if ($invmap_ref->[$i] =~ / (.*) ( < .*? > )/x) { | |
1856 | my $name = $1; | |
1857 | my $type = $2; | |
1858 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1859 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1860 | { | |
1861 | fail("prop_invmap('$mod_prop')"); | |
1862 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1863 | next PROPERTY; | |
1864 | } | |
1865 | if ($type eq "<hangul syllable>") { | |
1866 | if ($name ne "") { | |
1867 | fail("prop_invmap('$mod_prop')"); | |
1868 | diag("Unexpected text in $invmap_ref->[$i]"); | |
1869 | next PROPERTY; | |
1870 | } | |
1871 | if ($start != 0xAC00) { | |
1872 | fail("prop_invmap('$mod_prop')"); | |
1873 | diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start)); | |
1874 | next PROPERTY; | |
1875 | } | |
1876 | if ($end != $start + 11172 - 1) { | |
1877 | fail("prop_invmap('$mod_prop')"); | |
1878 | diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end)); | |
1879 | next PROPERTY; | |
1880 | } | |
1881 | } | |
1882 | elsif ($type ne "<code point>") { | |
1883 | fail("prop_invmap('$mod_prop')"); | |
1884 | diag("Unexpected text '$type' in $invmap_ref->[$i]"); | |
1885 | next PROPERTY; | |
1886 | } | |
1887 | else { | |
1888 | ||
1889 | # Look through the array of names that end in code points, | |
1890 | # and look for this start and end. If not found is an | |
1891 | # error. If found, delete it, and at the end, make sure | |
1892 | # have deleted everything. | |
1893 | for my $i (0 .. @code_point_in_names - 1) { | |
1894 | my $hash = $code_point_in_names[$i]; | |
1895 | if ($hash->{'low'} == $start | |
1896 | && $hash->{'high'} == $end | |
1897 | && "$hash->{'name'}-" eq $name) | |
1898 | { | |
1899 | splice @code_point_in_names, $i, 1; | |
1900 | last; | |
1901 | } | |
1902 | else { | |
1903 | fail("prop_invmap('$mod_prop')"); | |
1904 | diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'"); | |
1905 | next PROPERTY; | |
1906 | } | |
1907 | } | |
1908 | } | |
1909 | ||
1910 | next; | |
1911 | } | |
1912 | ||
1913 | # Have adjusted the map, as needed. Append to running string. | |
1914 | $end = ($start == $end) ? "" : sprintf("%05X", $end); | |
1915 | $tested_map .= sprintf "%05X\t%s\n", $start, $invmap_ref->[$i]; | |
1916 | } | |
1917 | ||
1918 | # Finished creating the string from the inversion map. Can compare | |
1919 | # with what the file is. | |
6bbe4a24 | 1920 | local $/ = "\n"; |
62b3b855 | 1921 | chomp $tested_map; |
6bbe4a24 | 1922 | $/ = $input_record_separator; |
62b3b855 KW |
1923 | if ($tested_map ne $official) { |
1924 | fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); | |
1925 | next PROPERTY; | |
1926 | } | |
1927 | if (@code_point_in_names) { | |
1928 | fail("prop_invmap('$mod_prop')"); | |
1929 | use Data::Dumper; | |
1930 | diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names); | |
1931 | next PROPERTY; | |
1932 | } | |
1933 | } | |
4f143a72 | 1934 | elsif ($format eq 's') { |
62b3b855 KW |
1935 | |
1936 | # Here the map is not more or less directly from a file stored on | |
1937 | # disk. We try a different tack. These should all be properties that | |
1938 | # have just a few possible values (most of them are binary). We go | |
1939 | # through the map list, sorting each range into buckets, one for each | |
1940 | # map value. Thus for binary properties there will be a bucket for Y | |
1941 | # and one for N. The buckets are inversion lists. We compare each | |
1942 | # constructed inversion list with what we would get for it using | |
1943 | # prop_invlist(), which has already been tested. If they all match, | |
1944 | # the whole map must have matched. | |
1945 | my %maps; | |
1946 | my $previous_map; | |
1947 | ||
1948 | # (The extra -1 is to not look at the final element in the loop, which | |
1949 | # we know is the one that starts just beyond Unicode and goes to | |
1950 | # infinity.) | |
1951 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
1952 | my $range_start = $invlist_ref->[$i]; | |
1953 | ||
1954 | # Because we are sorting into buckets, things could be | |
1955 | # out-of-order here, and still be in the correct order in the | |
1956 | # bucket, and hence wouldn't show up as an error; so have to | |
1957 | # check. | |
1958 | if (($i > 0 && $range_start <= $invlist_ref->[$i-1]) | |
1959 | || $range_start >= $invlist_ref->[$i+1]) | |
1960 | { | |
1961 | fail("prop_invmap('$mod_prop')"); | |
1962 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1963 | next PROPERTY; | |
1964 | } | |
1965 | ||
1966 | # This new range closes out the range started in the previous | |
1967 | # iteration. | |
1968 | push @{$maps{$previous_map}}, $range_start if defined $previous_map; | |
1969 | ||
1970 | # And starts a range which will be closed in the next iteration. | |
1971 | $previous_map = $invmap_ref->[$i]; | |
1972 | push @{$maps{$previous_map}}, $range_start; | |
1973 | } | |
1974 | ||
1975 | # The range we just started hasn't been closed, and we didn't look at | |
1976 | # the final element of the loop. If that range is for the default | |
1977 | # value, it shouldn't be closed, as it is to extend to infinity. But | |
1978 | # otherwise, it should end at the final Unicode code point, and the | |
1979 | # list that maps to the default value should have another element that | |
1980 | # does go to infinity for every above Unicode code point. | |
1981 | ||
1982 | if (@$invlist_ref > 1) { | |
1983 | my $penultimate_map = $invmap_ref->[-2]; | |
1984 | if ($penultimate_map ne $missing) { | |
1985 | ||
1986 | # The -1th element contains the first non-Unicode code point. | |
1987 | push @{$maps{$penultimate_map}}, $invlist_ref->[-1]; | |
1988 | push @{$maps{$missing}}, $invlist_ref->[-1]; | |
1989 | } | |
1990 | } | |
1991 | ||
1992 | # Here, we have the buckets (inversion lists) all constructed. Go | |
1993 | # through each and verify that matches what prop_invlist() returns. | |
1994 | # We could use is_deeply() for the comparison, but would get multiple | |
1995 | # messages for each $prop. | |
1996 | foreach my $map (keys %maps) { | |
1997 | my @off_invlist = prop_invlist("$prop = $map"); | |
1998 | my $min = (@off_invlist >= @{$maps{$map}}) | |
1999 | ? @off_invlist | |
2000 | : @{$maps{$map}}; | |
2001 | for my $i (0 .. $min- 1) { | |
2002 | if ($i > @off_invlist - 1) { | |
2003 | fail("prop_invmap('$mod_prop')"); | |
2004 | diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'"); | |
2005 | next PROPERTY; | |
2006 | } | |
2007 | elsif ($i > @{$maps{$map}} - 1) { | |
2008 | fail("prop_invmap('$mod_prop')"); | |
2009 | diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'"); | |
2010 | next PROPERTY; | |
2011 | } | |
2012 | elsif ($maps{$map}[$i] ne $off_invlist[$i]) { | |
2013 | fail("prop_invmap('$mod_prop')"); | |
2014 | diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'"); | |
2015 | next PROPERTY; | |
2016 | } | |
2017 | } | |
2018 | } | |
2019 | } | |
2020 | else { # Don't know this property nor format. | |
2021 | ||
2022 | fail("prop_invmap('$mod_prop')"); | |
2023 | diag("Unknown format '$format'"); | |
2024 | } | |
2025 | ||
2026 | pass("prop_invmap('$mod_prop')"); | |
2027 | } | |
2028 | ||
1fdd5e53 KW |
2029 | # A few tests of search_invlist |
2030 | use Unicode::UCD qw(search_invlist); | |
2031 | ||
2032 | my ($scripts_ranges_ref, $scripts_map_ref) = prop_invmap("Script"); | |
2033 | my $index = search_invlist($scripts_ranges_ref, 0x390); | |
2034 | is($scripts_map_ref->[$index], "Greek", "U+0390 is Greek"); | |
2035 | my @alpha_invlist = prop_invlist("Alpha"); | |
2036 | is(search_invlist(\@alpha_invlist, ord("\t")), undef, "search_invlist returns undef for code points before first one on the list"); | |
2037 | ||
6bbe4a24 | 2038 | ok($/ eq $input_record_separator, "The record separator didn't get overridden"); |
eaebe4db | 2039 | done_testing(); |