Commit | Line | Data |
---|---|---|
25a47338 | 1 | #!perl -w |
8b731da2 | 2 | BEGIN { |
a452d459 | 3 | if (ord("A") != 65) { |
8b731da2 JH |
4 | print "1..0 # Skip: EBCDIC\n"; |
5 | exit 0; | |
6 | } | |
a778afa6 AD |
7 | chdir 't' if -d 't'; |
8 | @INC = '../lib'; | |
25a47338 NC |
9 | require Config; import Config; |
10 | if ($Config{'extensions'} !~ /\bStorable\b/) { | |
11 | print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n"; | |
12 | exit 0; | |
13 | } | |
8b731da2 JH |
14 | } |
15 | ||
a778afa6 AD |
16 | use strict; |
17 | use Unicode::UCD; | |
f5c9f3db | 18 | use Test::More; |
8b731da2 | 19 | |
55d7b906 | 20 | use Unicode::UCD 'charinfo'; |
561c79ed | 21 | |
b08cd201 JH |
22 | my $charinfo; |
23 | ||
9e92970c KW |
24 | is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef"); |
25 | ||
e10d7780 KW |
26 | $charinfo = charinfo(0); # Null is often problematic, so test it. |
27 | ||
28 | is($charinfo->{code}, '0000', '<control>'); | |
29 | is($charinfo->{name}, '<control>'); | |
30 | is($charinfo->{category}, 'Cc'); | |
31 | is($charinfo->{combining}, '0'); | |
32 | is($charinfo->{bidi}, 'BN'); | |
33 | is($charinfo->{decomposition}, ''); | |
34 | is($charinfo->{decimal}, ''); | |
35 | is($charinfo->{digit}, ''); | |
36 | is($charinfo->{numeric}, ''); | |
37 | is($charinfo->{mirrored}, 'N'); | |
38 | is($charinfo->{unicode10}, 'NULL'); | |
39 | is($charinfo->{comment}, ''); | |
40 | is($charinfo->{upper}, ''); | |
41 | is($charinfo->{lower}, ''); | |
42 | is($charinfo->{title}, ''); | |
43 | is($charinfo->{block}, 'Basic Latin'); | |
44 | is($charinfo->{script}, 'Common'); | |
45 | ||
b08cd201 JH |
46 | $charinfo = charinfo(0x41); |
47 | ||
f5c9f3db JH |
48 | is($charinfo->{code}, '0041', 'LATIN CAPITAL LETTER A'); |
49 | is($charinfo->{name}, 'LATIN CAPITAL LETTER A'); | |
50 | is($charinfo->{category}, 'Lu'); | |
51 | is($charinfo->{combining}, '0'); | |
52 | is($charinfo->{bidi}, 'L'); | |
53 | is($charinfo->{decomposition}, ''); | |
54 | is($charinfo->{decimal}, ''); | |
55 | is($charinfo->{digit}, ''); | |
56 | is($charinfo->{numeric}, ''); | |
57 | is($charinfo->{mirrored}, 'N'); | |
58 | is($charinfo->{unicode10}, ''); | |
59 | is($charinfo->{comment}, ''); | |
60 | is($charinfo->{upper}, ''); | |
61 | is($charinfo->{lower}, '0061'); | |
62 | is($charinfo->{title}, ''); | |
63 | is($charinfo->{block}, 'Basic Latin'); | |
64 | is($charinfo->{script}, 'Latin'); | |
b08cd201 JH |
65 | |
66 | $charinfo = charinfo(0x100); | |
67 | ||
f5c9f3db JH |
68 | is($charinfo->{code}, '0100', 'LATIN CAPITAL LETTER A WITH MACRON'); |
69 | is($charinfo->{name}, 'LATIN CAPITAL LETTER A WITH MACRON'); | |
70 | is($charinfo->{category}, 'Lu'); | |
71 | is($charinfo->{combining}, '0'); | |
72 | is($charinfo->{bidi}, 'L'); | |
73 | is($charinfo->{decomposition}, '0041 0304'); | |
74 | is($charinfo->{decimal}, ''); | |
75 | is($charinfo->{digit}, ''); | |
76 | is($charinfo->{numeric}, ''); | |
77 | is($charinfo->{mirrored}, 'N'); | |
78 | is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER A MACRON'); | |
79 | is($charinfo->{comment}, ''); | |
80 | is($charinfo->{upper}, ''); | |
81 | is($charinfo->{lower}, '0101'); | |
82 | is($charinfo->{title}, ''); | |
83 | is($charinfo->{block}, 'Latin Extended-A'); | |
84 | is($charinfo->{script}, 'Latin'); | |
a196fbfd JH |
85 | |
86 | # 0x0590 is in the Hebrew block but unused. | |
561c79ed | 87 | |
b08cd201 JH |
88 | $charinfo = charinfo(0x590); |
89 | ||
f5c9f3db JH |
90 | is($charinfo->{code}, undef, '0x0590 - unused Hebrew'); |
91 | is($charinfo->{name}, undef); | |
92 | is($charinfo->{category}, undef); | |
93 | is($charinfo->{combining}, undef); | |
94 | is($charinfo->{bidi}, undef); | |
95 | is($charinfo->{decomposition}, undef); | |
96 | is($charinfo->{decimal}, undef); | |
97 | is($charinfo->{digit}, undef); | |
98 | is($charinfo->{numeric}, undef); | |
99 | is($charinfo->{mirrored}, undef); | |
100 | is($charinfo->{unicode10}, undef); | |
101 | is($charinfo->{comment}, undef); | |
102 | is($charinfo->{upper}, undef); | |
103 | is($charinfo->{lower}, undef); | |
104 | is($charinfo->{title}, undef); | |
105 | is($charinfo->{block}, undef); | |
106 | is($charinfo->{script}, undef); | |
a196fbfd JH |
107 | |
108 | # 0x05d0 is in the Hebrew block and used. | |
561c79ed | 109 | |
b08cd201 JH |
110 | $charinfo = charinfo(0x5d0); |
111 | ||
f5c9f3db JH |
112 | is($charinfo->{code}, '05D0', '05D0 - used Hebrew'); |
113 | is($charinfo->{name}, 'HEBREW LETTER ALEF'); | |
114 | is($charinfo->{category}, 'Lo'); | |
115 | is($charinfo->{combining}, '0'); | |
116 | is($charinfo->{bidi}, 'R'); | |
117 | is($charinfo->{decomposition}, ''); | |
118 | is($charinfo->{decimal}, ''); | |
119 | is($charinfo->{digit}, ''); | |
120 | is($charinfo->{numeric}, ''); | |
121 | is($charinfo->{mirrored}, 'N'); | |
122 | is($charinfo->{unicode10}, ''); | |
123 | is($charinfo->{comment}, ''); | |
124 | is($charinfo->{upper}, ''); | |
125 | is($charinfo->{lower}, ''); | |
126 | is($charinfo->{title}, ''); | |
127 | is($charinfo->{block}, 'Hebrew'); | |
128 | is($charinfo->{script}, 'Hebrew'); | |
561c79ed | 129 | |
74f8133e | 130 | # An open syllable in Hangul. |
a6fa416b TS |
131 | |
132 | $charinfo = charinfo(0xAC00); | |
133 | ||
05dbc6f8 KW |
134 | is($charinfo->{code}, 'AC00', 'HANGUL SYLLABLE U+AC00'); |
135 | is($charinfo->{name}, 'HANGUL SYLLABLE GA'); | |
f5c9f3db JH |
136 | is($charinfo->{category}, 'Lo'); |
137 | is($charinfo->{combining}, '0'); | |
138 | is($charinfo->{bidi}, 'L'); | |
05dbc6f8 | 139 | is($charinfo->{decomposition}, '1100 1161'); |
f5c9f3db JH |
140 | is($charinfo->{decimal}, ''); |
141 | is($charinfo->{digit}, ''); | |
142 | is($charinfo->{numeric}, ''); | |
143 | is($charinfo->{mirrored}, 'N'); | |
144 | is($charinfo->{unicode10}, ''); | |
145 | is($charinfo->{comment}, ''); | |
146 | is($charinfo->{upper}, ''); | |
147 | is($charinfo->{lower}, ''); | |
148 | is($charinfo->{title}, ''); | |
149 | is($charinfo->{block}, 'Hangul Syllables'); | |
150 | is($charinfo->{script}, 'Hangul'); | |
a6fa416b | 151 | |
74f8133e | 152 | # A closed syllable in Hangul. |
a6fa416b TS |
153 | |
154 | $charinfo = charinfo(0xAE00); | |
155 | ||
05dbc6f8 KW |
156 | is($charinfo->{code}, 'AE00', 'HANGUL SYLLABLE U+AE00'); |
157 | is($charinfo->{name}, 'HANGUL SYLLABLE GEUL'); | |
f5c9f3db JH |
158 | is($charinfo->{category}, 'Lo'); |
159 | is($charinfo->{combining}, '0'); | |
160 | is($charinfo->{bidi}, 'L'); | |
05dbc6f8 | 161 | is($charinfo->{decomposition}, "1100 1173 11AF"); |
f5c9f3db JH |
162 | is($charinfo->{decimal}, ''); |
163 | is($charinfo->{digit}, ''); | |
164 | is($charinfo->{numeric}, ''); | |
165 | is($charinfo->{mirrored}, 'N'); | |
166 | is($charinfo->{unicode10}, ''); | |
167 | is($charinfo->{comment}, ''); | |
168 | is($charinfo->{upper}, ''); | |
169 | is($charinfo->{lower}, ''); | |
170 | is($charinfo->{title}, ''); | |
171 | is($charinfo->{block}, 'Hangul Syllables'); | |
172 | is($charinfo->{script}, 'Hangul'); | |
a6fa416b TS |
173 | |
174 | $charinfo = charinfo(0x1D400); | |
175 | ||
f5c9f3db JH |
176 | is($charinfo->{code}, '1D400', 'MATHEMATICAL BOLD CAPITAL A'); |
177 | is($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A'); | |
178 | is($charinfo->{category}, 'Lu'); | |
179 | is($charinfo->{combining}, '0'); | |
180 | is($charinfo->{bidi}, 'L'); | |
181 | is($charinfo->{decomposition}, '<font> 0041'); | |
182 | is($charinfo->{decimal}, ''); | |
183 | is($charinfo->{digit}, ''); | |
184 | is($charinfo->{numeric}, ''); | |
185 | is($charinfo->{mirrored}, 'N'); | |
186 | is($charinfo->{unicode10}, ''); | |
187 | is($charinfo->{comment}, ''); | |
188 | is($charinfo->{upper}, ''); | |
189 | is($charinfo->{lower}, ''); | |
190 | is($charinfo->{title}, ''); | |
191 | is($charinfo->{block}, 'Mathematical Alphanumeric Symbols'); | |
7be0dac3 | 192 | is($charinfo->{script}, 'Common'); |
a6fa416b | 193 | |
a452d459 KW |
194 | $charinfo = charinfo(0x9FBA); #Bug 58428 |
195 | ||
196 | is($charinfo->{code}, '9FBA', 'U+9FBA'); | |
197 | is($charinfo->{name}, 'CJK UNIFIED IDEOGRAPH-9FBA'); | |
198 | is($charinfo->{category}, 'Lo'); | |
199 | is($charinfo->{combining}, '0'); | |
200 | is($charinfo->{bidi}, 'L'); | |
201 | is($charinfo->{decomposition}, ''); | |
202 | is($charinfo->{decimal}, ''); | |
203 | is($charinfo->{digit}, ''); | |
204 | is($charinfo->{numeric}, ''); | |
205 | is($charinfo->{mirrored}, 'N'); | |
206 | is($charinfo->{unicode10}, ''); | |
207 | is($charinfo->{comment}, ''); | |
208 | is($charinfo->{upper}, ''); | |
209 | is($charinfo->{lower}, ''); | |
210 | is($charinfo->{title}, ''); | |
211 | is($charinfo->{block}, 'CJK Unified Ideographs'); | |
212 | is($charinfo->{script}, 'Han'); | |
213 | ||
55d7b906 | 214 | use Unicode::UCD qw(charblock charscript); |
a196fbfd JH |
215 | |
216 | # 0x0590 is in the Hebrew block but unused. | |
561c79ed | 217 | |
f5c9f3db | 218 | is(charblock(0x590), 'Hebrew', '0x0590 - Hebrew unused charblock'); |
8079ad82 | 219 | is(charscript(0x590), 'Unknown', '0x0590 - Hebrew unused charscript'); |
c707cf8e | 220 | is(charblock(0x1FFFF), 'No_Block', '0x1FFFF - unused charblock'); |
561c79ed | 221 | |
b08cd201 JH |
222 | $charinfo = charinfo(0xbe); |
223 | ||
f5c9f3db JH |
224 | is($charinfo->{code}, '00BE', 'VULGAR FRACTION THREE QUARTERS'); |
225 | is($charinfo->{name}, 'VULGAR FRACTION THREE QUARTERS'); | |
226 | is($charinfo->{category}, 'No'); | |
227 | is($charinfo->{combining}, '0'); | |
228 | is($charinfo->{bidi}, 'ON'); | |
229 | is($charinfo->{decomposition}, '<fraction> 0033 2044 0034'); | |
230 | is($charinfo->{decimal}, ''); | |
231 | is($charinfo->{digit}, ''); | |
232 | is($charinfo->{numeric}, '3/4'); | |
233 | is($charinfo->{mirrored}, 'N'); | |
234 | is($charinfo->{unicode10}, 'FRACTION THREE QUARTERS'); | |
235 | is($charinfo->{comment}, ''); | |
236 | is($charinfo->{upper}, ''); | |
237 | is($charinfo->{lower}, ''); | |
238 | is($charinfo->{title}, ''); | |
239 | is($charinfo->{block}, 'Latin-1 Supplement'); | |
7be0dac3 | 240 | is($charinfo->{script}, 'Common'); |
10a6ecd2 | 241 | |
7fcb6f68 KW |
242 | # This is to test a case where both simple and full lowercases exist and |
243 | # differ | |
244 | $charinfo = charinfo(0x130); | |
245 | ||
246 | is($charinfo->{code}, '0130', 'LATIN CAPITAL LETTER I WITH DOT ABOVE'); | |
247 | is($charinfo->{name}, 'LATIN CAPITAL LETTER I WITH DOT ABOVE'); | |
248 | is($charinfo->{category}, 'Lu'); | |
249 | is($charinfo->{combining}, '0'); | |
250 | is($charinfo->{bidi}, 'L'); | |
251 | is($charinfo->{decomposition}, '0049 0307'); | |
252 | is($charinfo->{decimal}, ''); | |
253 | is($charinfo->{digit}, ''); | |
254 | is($charinfo->{numeric}, ''); | |
255 | is($charinfo->{mirrored}, 'N'); | |
256 | is($charinfo->{unicode10}, 'LATIN CAPITAL LETTER I DOT'); | |
257 | is($charinfo->{comment}, ''); | |
258 | is($charinfo->{upper}, ''); | |
259 | is($charinfo->{lower}, '0069'); | |
260 | is($charinfo->{title}, ''); | |
261 | is($charinfo->{block}, 'Latin Extended-A'); | |
262 | is($charinfo->{script}, 'Latin'); | |
263 | ||
264 | # This is to test a case where both simple and full uppercases exist and | |
265 | # differ | |
266 | $charinfo = charinfo(0x1F80); | |
267 | ||
268 | is($charinfo->{code}, '1F80', 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI'); | |
269 | is($charinfo->{name}, 'GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI'); | |
270 | is($charinfo->{category}, 'Ll'); | |
271 | is($charinfo->{combining}, '0'); | |
272 | is($charinfo->{bidi}, 'L'); | |
273 | is($charinfo->{decomposition}, '1F00 0345'); | |
274 | is($charinfo->{decimal}, ''); | |
275 | is($charinfo->{digit}, ''); | |
276 | is($charinfo->{numeric}, ''); | |
277 | is($charinfo->{mirrored}, 'N'); | |
278 | is($charinfo->{unicode10}, ''); | |
279 | is($charinfo->{comment}, ''); | |
280 | is($charinfo->{upper}, '1F88'); | |
281 | is($charinfo->{lower}, ''); | |
282 | is($charinfo->{title}, '1F88'); | |
283 | is($charinfo->{block}, 'Greek Extended'); | |
284 | is($charinfo->{script}, 'Greek'); | |
285 | ||
55d7b906 | 286 | use Unicode::UCD qw(charblocks charscripts); |
10a6ecd2 | 287 | |
b08cd201 | 288 | my $charblocks = charblocks(); |
10a6ecd2 | 289 | |
f5c9f3db JH |
290 | ok(exists $charblocks->{Thai}, 'Thai charblock exists'); |
291 | is($charblocks->{Thai}->[0]->[0], hex('0e00')); | |
292 | ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist'); | |
10a6ecd2 | 293 | |
b08cd201 | 294 | my $charscripts = charscripts(); |
10a6ecd2 | 295 | |
f5c9f3db JH |
296 | ok(exists $charscripts->{Armenian}, 'Armenian charscript exists'); |
297 | is($charscripts->{Armenian}->[0]->[0], hex('0531')); | |
298 | ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist'); | |
10a6ecd2 JH |
299 | |
300 | my $charscript; | |
301 | ||
302 | $charscript = charscript("12ab"); | |
f5c9f3db | 303 | is($charscript, 'Ethiopic', 'Ethiopic charscript'); |
10a6ecd2 JH |
304 | |
305 | $charscript = charscript("0x12ab"); | |
f5c9f3db | 306 | is($charscript, 'Ethiopic'); |
10a6ecd2 JH |
307 | |
308 | $charscript = charscript("U+12ab"); | |
f5c9f3db | 309 | is($charscript, 'Ethiopic'); |
10a6ecd2 JH |
310 | |
311 | my $ranges; | |
312 | ||
313 | $ranges = charscript('Ogham'); | |
7bccef0b KW |
314 | is($ranges->[0]->[0], hex('1680'), 'Ogham charscript'); |
315 | is($ranges->[0]->[1], hex('169C')); | |
10a6ecd2 | 316 | |
55d7b906 | 317 | use Unicode::UCD qw(charinrange); |
10a6ecd2 JH |
318 | |
319 | $ranges = charscript('Cherokee'); | |
f5c9f3db | 320 | ok(!charinrange($ranges, "139f"), 'Cherokee charscript'); |
10a6ecd2 JH |
321 | ok( charinrange($ranges, "13a0")); |
322 | ok( charinrange($ranges, "13f4")); | |
323 | ok(!charinrange($ranges, "13f5")); | |
324 | ||
ea508aee JH |
325 | use Unicode::UCD qw(general_categories); |
326 | ||
327 | my $gc = general_categories(); | |
328 | ||
329 | ok(exists $gc->{L}, 'has L'); | |
330 | is($gc->{L}, 'Letter', 'L is Letter'); | |
331 | is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter'); | |
332 | ||
333 | use Unicode::UCD qw(bidi_types); | |
334 | ||
335 | my $bt = bidi_types(); | |
336 | ||
337 | ok(exists $bt->{L}, 'has L'); | |
338 | is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right'); | |
339 | is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic'); | |
340 | ||
a452d459 KW |
341 | # If this fails, then maybe one should look at the Unicode changes to see |
342 | # what else might need to be updated. | |
bd84d130 | 343 | is(Unicode::UCD::UnicodeVersion, '6.0.0', 'UnicodeVersion'); |
b08cd201 | 344 | |
55d7b906 | 345 | use Unicode::UCD qw(compexcl); |
b08cd201 | 346 | |
f5c9f3db | 347 | ok(!compexcl(0x0100), 'compexcl'); |
71a442a8 KW |
348 | ok(!compexcl(0xD801), 'compexcl of surrogate'); |
349 | ok(!compexcl(0x110000), 'compexcl of non-Unicode code point'); | |
b08cd201 JH |
350 | ok( compexcl(0x0958)); |
351 | ||
55d7b906 | 352 | use Unicode::UCD qw(casefold); |
b08cd201 JH |
353 | |
354 | my $casefold; | |
355 | ||
356 | $casefold = casefold(0x41); | |
357 | ||
a452d459 KW |
358 | is($casefold->{code}, '0041', 'casefold 0x41 code'); |
359 | is($casefold->{status}, 'C', 'casefold 0x41 status'); | |
360 | is($casefold->{mapping}, '0061', 'casefold 0x41 mapping'); | |
361 | is($casefold->{full}, '0061', 'casefold 0x41 full'); | |
362 | is($casefold->{simple}, '0061', 'casefold 0x41 simple'); | |
363 | is($casefold->{turkic}, "", 'casefold 0x41 turkic'); | |
b08cd201 JH |
364 | |
365 | $casefold = casefold(0xdf); | |
366 | ||
a452d459 KW |
367 | is($casefold->{code}, '00DF', 'casefold 0xDF code'); |
368 | is($casefold->{status}, 'F', 'casefold 0xDF status'); | |
369 | is($casefold->{mapping}, '0073 0073', 'casefold 0xDF mapping'); | |
370 | is($casefold->{full}, '0073 0073', 'casefold 0xDF full'); | |
371 | is($casefold->{simple}, "", 'casefold 0xDF simple'); | |
372 | is($casefold->{turkic}, "", 'casefold 0xDF turkic'); | |
373 | ||
374 | # Do different tests depending on if version <= 3.1, or not. | |
375 | (my $version = Unicode::UCD::UnicodeVersion) =~ /^(\d+)\.(\d+)/; | |
376 | if (defined $1 && ($1 <= 2 || $1 == 3 && defined $2 && $2 <= 1)) { | |
377 | $casefold = casefold(0x130); | |
378 | ||
379 | is($casefold->{code}, '0130', 'casefold 0x130 code'); | |
380 | is($casefold->{status}, 'I' , 'casefold 0x130 status'); | |
381 | is($casefold->{mapping}, '0069', 'casefold 0x130 mapping'); | |
382 | is($casefold->{full}, '0069', 'casefold 0x130 full'); | |
383 | is($casefold->{simple}, "0069", 'casefold 0x130 simple'); | |
384 | is($casefold->{turkic}, "0069", 'casefold 0x130 turkic'); | |
385 | ||
386 | $casefold = casefold(0x131); | |
387 | ||
388 | is($casefold->{code}, '0131', 'casefold 0x131 code'); | |
389 | is($casefold->{status}, 'I' , 'casefold 0x131 status'); | |
390 | is($casefold->{mapping}, '0069', 'casefold 0x131 mapping'); | |
391 | is($casefold->{full}, '0069', 'casefold 0x131 full'); | |
392 | is($casefold->{simple}, "0069", 'casefold 0x131 simple'); | |
393 | is($casefold->{turkic}, "0069", 'casefold 0x131 turkic'); | |
394 | } else { | |
395 | $casefold = casefold(0x49); | |
396 | ||
397 | is($casefold->{code}, '0049', 'casefold 0x49 code'); | |
398 | is($casefold->{status}, 'C' , 'casefold 0x49 status'); | |
399 | is($casefold->{mapping}, '0069', 'casefold 0x49 mapping'); | |
400 | is($casefold->{full}, '0069', 'casefold 0x49 full'); | |
401 | is($casefold->{simple}, "0069", 'casefold 0x49 simple'); | |
402 | is($casefold->{turkic}, "0131", 'casefold 0x49 turkic'); | |
403 | ||
404 | $casefold = casefold(0x130); | |
405 | ||
406 | is($casefold->{code}, '0130', 'casefold 0x130 code'); | |
407 | is($casefold->{status}, 'F' , 'casefold 0x130 status'); | |
408 | is($casefold->{mapping}, '0069 0307', 'casefold 0x130 mapping'); | |
409 | is($casefold->{full}, '0069 0307', 'casefold 0x130 full'); | |
410 | is($casefold->{simple}, "", 'casefold 0x130 simple'); | |
411 | is($casefold->{turkic}, "0069", 'casefold 0x130 turkic'); | |
412 | } | |
413 | ||
414 | $casefold = casefold(0x1F88); | |
415 | ||
416 | is($casefold->{code}, '1F88', 'casefold 0x1F88 code'); | |
417 | is($casefold->{status}, 'S' , 'casefold 0x1F88 status'); | |
418 | is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping'); | |
419 | is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full'); | |
420 | is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple'); | |
421 | is($casefold->{turkic}, "", 'casefold 0x1F88 turkic'); | |
b08cd201 JH |
422 | |
423 | ok(!casefold(0x20)); | |
424 | ||
55d7b906 | 425 | use Unicode::UCD qw(casespec); |
b08cd201 JH |
426 | |
427 | my $casespec; | |
428 | ||
429 | ok(!casespec(0x41)); | |
430 | ||
431 | $casespec = casespec(0xdf); | |
432 | ||
433 | ok($casespec->{code} eq '00DF' && | |
434 | $casespec->{lower} eq '00DF' && | |
435 | $casespec->{title} eq '0053 0073' && | |
436 | $casespec->{upper} eq '0053 0053' && | |
2d3cf3ee | 437 | !defined $casespec->{condition}, 'casespec 0xDF'); |
b08cd201 JH |
438 | |
439 | $casespec = casespec(0x307); | |
440 | ||
f499c386 | 441 | ok($casespec->{az}->{code} eq '0307' && |
2d3cf3ee | 442 | !defined $casespec->{az}->{lower} && |
f499c386 JH |
443 | $casespec->{az}->{title} eq '0307' && |
444 | $casespec->{az}->{upper} eq '0307' && | |
9c3dc587 | 445 | $casespec->{az}->{condition} eq 'az After_I', |
f5c9f3db | 446 | 'casespec 0x307'); |
6c8d78fb HS |
447 | |
448 | # perl #7305 UnicodeCD::compexcl is weird | |
449 | ||
2d3cf3ee | 450 | for (1) {my $a=compexcl $_} |
6c8d78fb | 451 | ok(1, 'compexcl read-only $_: perl #7305'); |
1f27373c | 452 | map {compexcl $_} %{{1=>2}}; |
6c8d78fb HS |
453 | ok(1, 'compexcl read-only hash: perl #7305'); |
454 | ||
d7829152 JH |
455 | is(Unicode::UCD::_getcode('123'), 123, "_getcode(123)"); |
456 | is(Unicode::UCD::_getcode('0123'), 0x123, "_getcode(0123)"); | |
457 | is(Unicode::UCD::_getcode('0x123'), 0x123, "_getcode(0x123)"); | |
458 | is(Unicode::UCD::_getcode('0X123'), 0x123, "_getcode(0X123)"); | |
459 | is(Unicode::UCD::_getcode('U+123'), 0x123, "_getcode(U+123)"); | |
460 | is(Unicode::UCD::_getcode('u+123'), 0x123, "_getcode(u+123)"); | |
461 | is(Unicode::UCD::_getcode('U+1234'), 0x1234, "_getcode(U+1234)"); | |
462 | is(Unicode::UCD::_getcode('U+12345'), 0x12345, "_getcode(U+12345)"); | |
463 | is(Unicode::UCD::_getcode('123x'), undef, "_getcode(123x)"); | |
464 | is(Unicode::UCD::_getcode('x123'), undef, "_getcode(x123)"); | |
465 | is(Unicode::UCD::_getcode('0x123x'), undef, "_getcode(x123)"); | |
466 | is(Unicode::UCD::_getcode('U+123x'), undef, "_getcode(x123)"); | |
741297c1 JH |
467 | |
468 | { | |
469 | my $r1 = charscript('Latin'); | |
470 | my $n1 = @$r1; | |
7bccef0b | 471 | is($n1, 30, "number of ranges in Latin script (Unicode 6.0.0)"); |
741297c1 JH |
472 | shift @$r1 while @$r1; |
473 | my $r2 = charscript('Latin'); | |
474 | is(@$r2, $n1, "modifying results should not mess up internal caches"); | |
475 | } | |
476 | ||
c5a29f40 LM |
477 | { |
478 | is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD"); | |
2d3cf3ee | 479 | } |
a2bd7410 JH |
480 | |
481 | use Unicode::UCD qw(namedseq); | |
482 | ||
483 | is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq"); | |
484 | is(namedseq("KATAKANA LETTER AINU Q"), undef); | |
485 | is(namedseq(), undef); | |
486 | is(namedseq(qw(foo bar)), undef); | |
487 | my @ns = namedseq("KATAKANA LETTER AINU P"); | |
488 | is(scalar @ns, 2); | |
489 | is($ns[0], 0x31F7); | |
490 | is($ns[1], 0x309A); | |
491 | my %ns = namedseq(); | |
492 | is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}"); | |
493 | @ns = namedseq(42); | |
494 | is(@ns, 0); | |
495 | ||
7319f91d KW |
496 | use Unicode::UCD qw(num); |
497 | use charnames ":full"; | |
498 | ||
499 | is(num("0"), 0, 'Verify num("0") == 0'); | |
500 | is(num("98765"), 98765, 'Verify num("98765") == 98765'); | |
501 | ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined'); | |
c4b3d89d KW |
502 | is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}") == 21'); |
503 | ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}") isnt defined'); | |
7319f91d KW |
504 | is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3'); |
505 | ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined'); | |
506 | is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2'); | |
507 | is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000'); | |
508 | is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5'); | |
98025745 | 509 | is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9'); |
eaebe4db | 510 | |
7ef25837 KW |
511 | # Create a user-defined property |
512 | sub InKana {<<'END'} | |
513 | 3040 309F | |
514 | 30A0 30FF | |
515 | END | |
516 | ||
517 | use Unicode::UCD qw(prop_aliases); | |
518 | ||
519 | is(prop_aliases(undef), undef, "prop_aliases(undef) returns <undef>"); | |
520 | is(prop_aliases("unknown property"), undef, | |
521 | "prop_aliases(<unknown property>) returns <undef>"); | |
522 | is(prop_aliases("InKana"), undef, | |
523 | "prop_aliases(<user-defined property>) returns <undef>"); | |
524 | is(prop_aliases("Perl_Decomposition_Mapping"), undef, "prop_aliases('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only"); | |
525 | is(prop_aliases("Perl_Charnames"), undef, | |
526 | "prop_aliases('Perl_Charnames') returns <undef> since internal-Perl-only"); | |
527 | is(prop_aliases("isgc"), undef, | |
528 | "prop_aliases('isgc') returns <undef> since is not covered Perl extension"); | |
529 | is(prop_aliases("Is_Is_Any"), undef, | |
530 | "prop_aliases('Is_Is_Any') returns <undef> since two is's"); | |
531 | ||
532 | require 'utf8_heavy.pl'; | |
533 | require "unicore/Heavy.pl"; | |
534 | ||
535 | # Keys are lists of properties. Values are defined if have been tested. | |
536 | my %props; | |
537 | ||
538 | # To test for loose matching, add in the characters that are ignored there. | |
539 | my $extra_chars = "-_ "; | |
540 | ||
541 | # The one internal property we accept | |
542 | $props{'Perl_Decimal_Digit'} = 1; | |
543 | my @list = prop_aliases("perldecimaldigit"); | |
544 | is_deeply(\@list, | |
545 | [ "Perl_Decimal_Digit", | |
546 | "Perl_Decimal_Digit" | |
547 | ], "prop_aliases('perldecimaldigit') returns Perl_Decimal_Digit as both short and full names"); | |
548 | ||
549 | # Get the official Unicode property name synonyms and test them. | |
550 | open my $props, "<", "../lib/unicore/PropertyAliases.txt" | |
551 | or die "Can't open Unicode PropertyAliases.txt"; | |
552 | while (<$props>) { | |
553 | s/\s*#.*//; # Remove comments | |
554 | next if /^\s* $/x; # Ignore empty and comment lines | |
555 | ||
556 | chomp; | |
557 | my $count = 0; # 0th field in line is short name; 1th is long name | |
558 | my $short_name; | |
559 | my $full_name; | |
560 | my @names_via_short; | |
561 | foreach my $alias (split /\s*;\s*/) { # Fields are separated by | |
562 | # semi-colons | |
563 | # Add in the characters that are supposed to be ignored, to test loose | |
564 | # matching, which the tested function does on all inputs. | |
565 | my $mod_name = "$extra_chars$alias"; | |
566 | ||
567 | my $loose = utf8::_loose_name(lc $alias); | |
568 | ||
569 | # Indicate we have tested this. | |
570 | $props{$loose} = 1; | |
571 | ||
572 | my @all_names = prop_aliases($mod_name); | |
573 | if (grep { $_ eq $loose } @Unicode::UCD::suppressed_properties) { | |
574 | is(@all_names, 0, "prop_aliases('$mod_name') returns undef since $alias is not installed"); | |
575 | next; | |
576 | } | |
577 | elsif (! @all_names) { | |
578 | fail("prop_aliases('$mod_name')"); | |
579 | diag("'$alias' is unknown to prop_aliases()"); | |
580 | next; | |
581 | } | |
582 | ||
583 | if ($count == 0) { # Is short name | |
584 | ||
585 | @names_via_short = prop_aliases($mod_name); | |
586 | ||
587 | # If the 0th test fails, no sense in continuing with the others | |
588 | last unless is($names_via_short[0], $alias, | |
589 | "prop_aliases: '$alias' is the short name for '$mod_name'"); | |
590 | $short_name = $alias; | |
591 | } | |
592 | elsif ($count == 1) { # Is full name | |
593 | ||
594 | # Some properties have the same short and full name; no sense | |
595 | # repeating the test if the same. | |
596 | if ($alias ne $short_name) { | |
597 | my @names_via_full = prop_aliases($mod_name); | |
598 | is_deeply(\@names_via_full, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'"); | |
599 | } | |
600 | ||
601 | # Tests scalar context | |
602 | is(prop_aliases($short_name), $alias, | |
603 | "prop_aliases: '$alias' is the long name for '$short_name'"); | |
604 | } | |
605 | else { # Is another alias | |
606 | is_deeply(\@all_names, \@names_via_short, "prop_aliases() returns the same list for both '$short_name' and '$mod_name'"); | |
607 | ok((grep { $_ =~ /^$alias$/i } @all_names), | |
608 | "prop_aliases: '$alias' is listed as an alias for '$mod_name'"); | |
609 | } | |
610 | ||
611 | $count++; | |
612 | } | |
613 | } | |
614 | ||
615 | # Now test anything we can find that wasn't covered by the tests of the | |
616 | # official properties. We have no way of knowing if mktables omitted a Perl | |
617 | # extension or not, but we do the best we can from its generated lists | |
618 | ||
619 | foreach my $alias (keys %utf8::loose_to_file_of) { | |
620 | next if $alias =~ /=/; | |
621 | my $lc_name = lc $alias; | |
622 | my $loose = utf8::_loose_name($lc_name); | |
623 | next if exists $props{$loose}; # Skip if already tested | |
624 | $props{$loose} = 1; | |
625 | my $mod_name = "$extra_chars$alias"; # Tests loose matching | |
626 | my @aliases = prop_aliases($mod_name); | |
627 | my $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases; | |
628 | if ($found_it) { | |
629 | pass("prop_aliases: '$lc_name' is listed as an alias for '$mod_name'"); | |
630 | } | |
631 | elsif ($lc_name =~ /l[_&]$/) { | |
632 | ||
633 | # These two names are special in that they don't appear in the | |
634 | # returned list because they are discouraged from use. Verify | |
635 | # that they return the same list as a non-discouraged version. | |
636 | my @LC = prop_aliases('Is_LC'); | |
637 | is_deeply(\@aliases, \@LC, "prop_aliases: '$lc_name' returns the same list as 'Is_LC'"); | |
638 | } | |
639 | else { | |
640 | my $stripped = $lc_name =~ s/^is//; | |
641 | ||
642 | # Could be that the input includes a prefix 'is', which is rarely | |
643 | # returned as an alias, so having successfully stripped it off above, | |
644 | # try again. | |
645 | if ($stripped) { | |
646 | $found_it = grep { utf8::_loose_name(lc $_) eq $lc_name } @aliases; | |
647 | } | |
648 | ||
649 | # If that didn't work, it could be that it's a block, which is always | |
650 | # returned with a leading 'In_' to avoid ambiguity. Try comparing | |
651 | # with that stripped off. | |
652 | if (! $found_it) { | |
653 | $found_it = grep { utf8::_loose_name(s/^In_(.*)/\L$1/r) eq $lc_name } | |
654 | @aliases; | |
655 | # Could check that is a real block, but tests for invmap will | |
656 | # likely pickup any errors, since this will be tested there. | |
657 | $lc_name = "in$lc_name" if $found_it; # Change for message below | |
658 | } | |
659 | my $message = "prop_aliases: '$lc_name' is listed as an alias for '$mod_name'"; | |
660 | ($found_it) ? pass($message) : fail($message); | |
661 | } | |
662 | } | |
663 | ||
664 | my $done_equals = 0; | |
665 | foreach my $alias (keys %utf8::stricter_to_file_of) { | |
666 | if ($alias =~ /=/) { # Only test one case where there is an equals | |
667 | next if $done_equals; | |
668 | $done_equals = 1; | |
669 | } | |
670 | my $lc_name = lc $alias; | |
671 | my @list = prop_aliases($alias); | |
672 | if ($alias =~ /^_/) { | |
673 | is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since it is internal_only"); | |
674 | } | |
675 | elsif ($alias =~ /=/) { | |
676 | is(@list, 0, "prop_aliases: '$lc_name' returns an empty list since is illegal property name"); | |
677 | } | |
678 | else { | |
679 | ok((grep { lc $_ eq $lc_name } @list), | |
680 | "prop_aliases: '$lc_name' is listed as an alias for '$alias'"); | |
681 | } | |
682 | } | |
683 | ||
684 | use Unicode::UCD qw(prop_value_aliases); | |
685 | ||
686 | is(prop_value_aliases("unknown property", "unknown value"), undef, | |
687 | "prop_value_aliases(<unknown property>, <unknown value>) returns <undef>"); | |
688 | is(prop_value_aliases(undef, undef), undef, | |
689 | "prop_value_aliases(undef, undef) returns <undef>"); | |
690 | is((prop_value_aliases("na", "A")), "A", "test that prop_value_aliases returns its input for properties that don't have synonyms"); | |
691 | is(prop_value_aliases("isgc", "C"), undef, "prop_value_aliases('isgc', 'C') returns <undef> since is not covered Perl extension"); | |
692 | is(prop_value_aliases("gc", "isC"), undef, "prop_value_aliases('gc', 'isC') returns <undef> since is not covered Perl extension"); | |
693 | ||
694 | # We have no way of knowing if mktables omitted a Perl extension that it | |
695 | # shouldn't have, but we can check if it omitted an official Unicode property | |
696 | # name synonym. And for those, we can check if the short and full names are | |
697 | # correct. | |
698 | ||
699 | my %pva_tested; # List of things already tested. | |
700 | open my $propvalues, "<", "../lib/unicore/PropValueAliases.txt" | |
701 | or die "Can't open Unicode PropValueAliases.txt"; | |
702 | while (<$propvalues>) { | |
703 | s/\s*#.*//; # Remove comments | |
704 | next if /^\s* $/x; # Ignore empty and comment lines | |
705 | chomp; | |
706 | ||
707 | my @fields = split /\s*;\s*/; # Fields are separated by semi-colons | |
708 | my $prop = shift @fields; # 0th field is the property, | |
709 | my $count = 0; # 0th field in line (after shifting off the property) is | |
710 | # short name; 1th is long name | |
711 | my $short_name; | |
712 | my @names_via_short; # Saves the values between iterations | |
713 | ||
714 | # The property on the lhs of the = is always loosely matched. Add in | |
715 | # characters that are ignored under loose matching to test that | |
716 | my $mod_prop = "$extra_chars$prop"; | |
717 | ||
718 | if ($fields[0] eq 'n/a') { # See comments in input file, essentially | |
719 | # means full name and short name are identical | |
720 | $fields[0] = $fields[1]; | |
721 | } | |
722 | elsif ($fields[0] ne $fields[1] | |
723 | && utf8::_loose_name(lc $fields[0]) | |
724 | eq utf8::_loose_name(lc $fields[1]) | |
725 | && $fields[1] !~ /[[:upper:]]/) | |
726 | { | |
727 | # Also, there is a bug in the file in which "n/a" is omitted, and | |
728 | # the two fields are identical except for case, and the full name | |
729 | # is all lower case. Copy the "short" name unto the full one to | |
730 | # give it some upper case. | |
731 | ||
732 | $fields[1] = $fields[0]; | |
733 | } | |
734 | ||
735 | # The ccc property in the file is special; has an extra numeric field | |
736 | # (0th), which should go at the end, since we use the next two fields as | |
737 | # the short and full names, respectively. See comments in input file. | |
738 | splice (@fields, 0, 0, splice(@fields, 1, 2)) if $prop eq 'ccc'; | |
739 | ||
740 | my $loose_prop = utf8::_loose_name(lc $prop); | |
741 | my $suppressed = grep { $_ eq $loose_prop } | |
742 | @Unicode::UCD::suppressed_properties; | |
743 | foreach my $value (@fields) { | |
744 | if ($suppressed) { | |
745 | is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop"); | |
746 | next; | |
747 | } | |
748 | elsif (grep { $_ eq ("$loose_prop=" . utf8::_loose_name(lc $value)) } @Unicode::UCD::suppressed_properties) { | |
749 | is(prop_value_aliases($prop, $value), undef, "prop_value_aliases('$prop', '$value') returns undef for suppressed property $prop=$value"); | |
750 | next; | |
751 | } | |
752 | ||
753 | # Add in test for loose matching. | |
754 | my $mod_value = "$extra_chars$value"; | |
755 | ||
756 | # If the value is a number, optionally negative, including a floating | |
757 | # point or rational numer, it should be only strictly matched, so the | |
758 | # loose matching should fail. | |
759 | if ($value =~ / ^ -? \d+ (?: [\/.] \d+ )? $ /x) { | |
760 | is(prop_value_aliases($mod_prop, $mod_value), undef, "prop_value_aliases('$mod_prop', '$mod_value') returns undef because '$mod_value' should be strictly matched"); | |
761 | ||
762 | # And reset so below tests just the strict matching. | |
763 | $mod_value = $value; | |
764 | } | |
765 | ||
766 | if ($count == 0) { | |
767 | ||
768 | @names_via_short = prop_value_aliases($mod_prop, $mod_value); | |
769 | ||
770 | # If the 0th test fails, no sense in continuing with the others | |
771 | last unless is($names_via_short[0], $value, "prop_value_aliases: In '$prop', '$value' is the short name for '$mod_value'"); | |
772 | $short_name = $value; | |
773 | } | |
774 | elsif ($count == 1) { | |
775 | ||
776 | # Some properties have the same short and full name; no sense | |
777 | # repeating the test if the same. | |
778 | if ($value ne $short_name) { | |
779 | my @names_via_full = | |
780 | prop_value_aliases($mod_prop, $mod_value); | |
781 | is_deeply(\@names_via_full, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'"); | |
782 | } | |
783 | ||
784 | # Tests scalar context | |
785 | is(prop_value_aliases($prop, $short_name), $value, "'$value' is the long name for prop_value_aliases('$prop', '$short_name')"); | |
786 | } | |
787 | else { | |
788 | my @all_names = prop_value_aliases($mod_prop, $mod_value); | |
789 | is_deeply(\@all_names, \@names_via_short, "In '$prop', prop_value_aliases() returns the same list for both '$short_name' and '$mod_value'"); | |
790 | ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) } prop_value_aliases($prop, $short_name)), "'$value' is listed as an alias for prop_value_aliases('$prop', '$short_name')"); | |
791 | } | |
792 | ||
793 | $pva_tested{utf8::_loose_name(lc $prop) . "=" . utf8::_loose_name(lc $value)} = 1; | |
794 | $count++; | |
795 | } | |
796 | } | |
797 | ||
798 | # And test as best we can, the non-official pva's that mktables generates. | |
799 | foreach my $hash (\%utf8::loose_to_file_of, \%utf8::stricter_to_file_of) { | |
800 | foreach my $test (keys %$hash) { | |
801 | next if exists $pva_tested{$test}; # Skip if already tested | |
802 | ||
803 | my ($prop, $value) = split "=", $test; | |
804 | next unless defined $value; # prop_value_aliases() requires an input | |
805 | # 'value' | |
806 | my $mod_value; | |
807 | if ($hash == \%utf8::loose_to_file_of) { | |
808 | ||
809 | # Add extra characters to test loose-match rhs value | |
810 | $mod_value = "$extra_chars$value"; | |
811 | } | |
812 | else { # Here value is strictly matched. | |
813 | ||
814 | # Extra elements are added by mktables to this hash so that | |
815 | # something like "age=6.0" has a synonym of "age=6". It's not | |
816 | # clear to me (khw) if we should be encouraging those synonyms, so | |
817 | # don't test for them. | |
818 | next if $value !~ /\D/ && exists $hash->{"$prop=$value.0"}; | |
819 | ||
820 | # Verify that loose matching fails when only strict is called for. | |
821 | next unless is(prop_value_aliases($prop, "$extra_chars$value"), undef, | |
822 | "prop_value_aliases('$prop', '$extra_chars$value') returns undef since '$value' should be strictly matched"), | |
823 | ||
824 | # Strict matching does allow for underscores between digits. Test | |
825 | # for that. | |
826 | $mod_value = $value; | |
827 | while ($mod_value =~ s/(\d)(\d)/$1_$2/g) {} | |
828 | } | |
829 | ||
830 | # The lhs property is always loosely matched, so add in extra | |
831 | # characters to test that. | |
832 | my $mod_prop = "$extra_chars$prop"; | |
833 | ||
834 | if ($prop eq 'gc' && $value =~ /l[_&]$/) { | |
835 | # These two names are special in that they don't appear in the | |
836 | # returned list because they are discouraged from use. Verify | |
837 | # that they return the same list as a non-discouraged version. | |
838 | my @LC = prop_value_aliases('gc', 'lc'); | |
839 | my @l_ = prop_value_aliases($mod_prop, $mod_value); | |
840 | is_deeply(\@l_, \@LC, "prop_value_aliases('$mod_prop', '$mod_value) returns the same list as prop_value_aliases('gc', 'lc')"); | |
841 | } | |
842 | else { | |
843 | ok((grep { utf8::_loose_name(lc $_) eq utf8::_loose_name(lc $value) } | |
844 | prop_value_aliases($mod_prop, $mod_value)), | |
845 | "'$value' is listed as an alias for prop_value_aliases('$mod_prop', '$mod_value')"); | |
846 | } | |
847 | } | |
848 | } | |
849 | ||
850 | undef %pva_tested; | |
851 | ||
681d705c KW |
852 | no warnings 'once'; # We use some values once from 'required' modules. |
853 | ||
62b3b855 KW |
854 | use Unicode::UCD qw(prop_invlist prop_invmap MAX_CP); |
855 | ||
856 | # There were some problems with caching interfering with prop_invlist() vs | |
857 | # prop_invmap() on binary properties, and also between the 3 properties where | |
858 | # Perl used the same 'To' name as another property (see utf8_heavy.pl). | |
859 | # So, before testing all of prop_invlist(), | |
860 | # 1) call prop_invmap() to try both orders of these name issues. This uses | |
861 | # up two of the 3 properties; the third will be left so that invlist() | |
862 | # on it gets called before invmap() | |
863 | # 2) call prop_invmap() on a generic binary property, ahead of invlist(). | |
864 | # This should test that the caching works in both directions. | |
865 | ||
866 | # These properties are not stable between Unicode versions, but the first few | |
867 | # elements are; just look at the first element to see if are getting the | |
868 | # distinction right. The general inversion map testing below will test the | |
869 | # whole thing. | |
870 | my $prop = "uc"; | |
871 | my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
872 | is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'"); | |
873 | is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'"); | |
874 | is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); | |
875 | is($invmap_ref->[1], 0x41, "prop_invmap('$prop') map[1] is 0x41"); | |
876 | ||
877 | $prop = "upper"; | |
878 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
879 | is($format, 's', "prop_invmap() format of '$prop' is 'cl'"); | |
880 | is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'"); | |
881 | is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); | |
882 | is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); | |
883 | ||
884 | $prop = "lower"; | |
885 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
886 | is($format, 's', "prop_invmap() format of '$prop' is 'cl'"); | |
887 | is($missing, 'N', "prop_invmap() missing of '$prop' is '<code point>'"); | |
888 | is($invlist_ref->[1], 0x61, "prop_invmap('$prop') list[1] is 0x61"); | |
889 | is($invmap_ref->[1], 'Y', "prop_invmap('$prop') map[1] is 'Y'"); | |
890 | ||
891 | $prop = "lc"; | |
892 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
893 | is($format, 'cl', "prop_invmap() format of '$prop' is 'cl'"); | |
894 | is($missing, '<code point>', "prop_invmap() missing of '$prop' is '<code point>'"); | |
895 | is($invlist_ref->[1], 0x41, "prop_invmap('$prop') list[1] is 0x41"); | |
896 | is($invmap_ref->[1], 0x61, "prop_invmap('$prop') map[1] is 0x61"); | |
897 | ||
898 | # This property is stable and small, so can test all of it | |
899 | $prop = "ASCII_Hex_Digit"; | |
900 | ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($prop); | |
901 | is($format, 's', "prop_invmap() format of '$prop' is 's'"); | |
902 | is($missing, 'N', "prop_invmap() missing of '$prop' is 'N'"); | |
903 | is_deeply($invlist_ref, [ 0x0000, 0x0030, 0x003A, 0x0041, | |
904 | 0x0047, 0x0061, 0x0067, 0x110000 ], | |
905 | "prop_invmap('$prop') code point list is correct"); | |
906 | is_deeply($invmap_ref, [ 'N', 'Y', 'N', 'Y', 'N', 'Y', 'N', 'N' ] , | |
907 | "prop_invmap('$prop') map list is correct"); | |
681d705c KW |
908 | |
909 | is(prop_invlist("Unknown property"), undef, "prop_invlist(<Unknown property>) returns undef"); | |
910 | is(prop_invlist(undef), undef, "prop_invlist(undef) returns undef"); | |
911 | is(prop_invlist("Any"), 2, "prop_invlist('Any') returns the number of elements in scalar context"); | |
912 | my @invlist = prop_invlist("Is_Any"); | |
913 | is_deeply(\@invlist, [ 0, 0x110000 ], "prop_invlist works on 'Is_' prefixes"); | |
914 | is(prop_invlist("Is_Is_Any"), undef, "prop_invlist('Is_Is_Any') returns <undef> since two is's"); | |
915 | ||
916 | use Storable qw(dclone); | |
917 | ||
918 | is(prop_invlist("InKana"), undef, "prop_invlist(<user-defined property returns undef>)"); | |
919 | ||
62b3b855 | 920 | # The way both the tests for invlist and invmap work is that they take the |
681d705c KW |
921 | # lists returned by the functions and construct from them what the original |
922 | # file should look like, which are then compared with the file. If they are | |
923 | # identical, the test passes. What this tests isn't that the results are | |
62b3b855 | 924 | # correct, but that invlist and invmap haven't introduced errors beyond what |
681d705c KW |
925 | # are there in the files. As a small hedge against that, test some |
926 | # prop_invlist() tables fully with the known correct result. We choose | |
927 | # ASCII_Hex_Digit again, as it is stable. | |
928 | @invlist = prop_invlist("AHex"); | |
929 | is_deeply(\@invlist, [ 0x0030, 0x003A, 0x0041, | |
930 | 0x0047, 0x0061, 0x0067 ], | |
931 | "prop_invlist('AHex') is exactly the expected set of points"); | |
932 | @invlist = prop_invlist("AHex=f"); | |
933 | is_deeply(\@invlist, [ 0x0000, 0x0030, 0x003A, 0x0041, | |
934 | 0x0047, 0x0061, 0x0067 ], | |
935 | "prop_invlist('AHex=f') is exactly the expected set of points"); | |
936 | ||
937 | sub fail_with_diff ($$$$) { | |
938 | # For use below to output better messages | |
939 | my ($prop, $official, $constructed, $tested_function_name) = @_; | |
940 | ||
941 | is($constructed, $official, "$tested_function_name('$prop')"); | |
942 | diag("Comment out lines " . (__LINE__ - 1) . " through " . (__LINE__ + 1) . " in '$0' on Un*x-like systems to see just the differences. Uses the 'diff' first in your \$PATH"); | |
943 | return; | |
944 | ||
945 | fail("$tested_function_name('$prop')"); | |
946 | ||
947 | require File::Temp; | |
948 | my $off = File::Temp->new(); | |
949 | chomp $official; | |
950 | print $off $official, "\n"; | |
951 | close $off || die "Can't close official"; | |
952 | ||
953 | chomp $constructed; | |
954 | my $gend = File::Temp->new(); | |
955 | print $gend $constructed, "\n"; | |
956 | close $gend || die "Can't close gend"; | |
957 | ||
958 | my $diff = File::Temp->new(); | |
959 | system("diff $off $gend > $diff"); | |
960 | ||
961 | open my $fh, "<", $diff || die "Can't open $diff"; | |
962 | my @diffs = <$fh>; | |
963 | diag("In the diff output below '<' marks lines from the filesystem tables;\n'>' are from $tested_function_name()"); | |
964 | diag(@diffs); | |
965 | } | |
966 | ||
967 | my %tested_invlist; | |
968 | ||
969 | # Look at everything we think that mktables tells us exists, both loose and | |
970 | # strict | |
971 | foreach my $set_of_tables (\%utf8::stricter_to_file_of, \%utf8::loose_to_file_of) | |
972 | { | |
973 | foreach my $table (keys %$set_of_tables) { | |
974 | ||
975 | my $mod_table; | |
976 | my ($prop_only, $value) = split "=", $table; | |
977 | if (defined $value) { | |
978 | ||
979 | # If this is to be loose matched, add in characters to test that. | |
980 | if ($set_of_tables == \%utf8::loose_to_file_of) { | |
981 | $value = "$extra_chars$value"; | |
982 | } | |
983 | else { # Strict match | |
984 | ||
985 | # Verify that loose matching fails when only strict is called | |
986 | # for. | |
987 | next unless is(prop_invlist("$prop_only=$extra_chars$value"), undef, "prop_invlist('$prop_only=$extra_chars$value') returns undef since should be strictly matched"); | |
988 | ||
989 | # Strict matching does allow for underscores between digits. | |
990 | # Test for that. | |
991 | while ($value =~ s/(\d)(\d)/$1_$2/g) {} | |
992 | } | |
993 | ||
994 | # The property portion in compound form specifications always | |
995 | # matches loosely | |
996 | $mod_table = "$extra_chars$prop_only = $value"; | |
997 | } | |
998 | else { # Single-form. | |
999 | ||
1000 | # Like above, use looose if required, and insert underscores | |
1001 | # between digits if strict. | |
1002 | if ($set_of_tables == \%utf8::loose_to_file_of) { | |
1003 | $mod_table = "$extra_chars$table"; | |
1004 | } | |
1005 | else { | |
1006 | $mod_table = $table; | |
1007 | while ($mod_table =~ s/(\d)(\d)/$1_$2/g) {} | |
1008 | } | |
1009 | } | |
1010 | ||
1011 | my @tested = prop_invlist($mod_table); | |
1012 | if ($table =~ /^_/) { | |
1013 | is(@tested, 0, "prop_invlist('$mod_table') returns an empty list since is internal-only"); | |
1014 | next; | |
1015 | } | |
1016 | ||
1017 | # If we have already tested a property that uses the same file, this | |
1018 | # list should be identical to the one that was tested, and can bypass | |
1019 | # everything else. | |
1020 | my $file = $set_of_tables->{$table}; | |
1021 | if (exists $tested_invlist{$file}) { | |
1022 | is_deeply(\@tested, $tested_invlist{$file}, "prop_invlist('$mod_table') gave same results as its name synonym"); | |
1023 | next; | |
1024 | } | |
1025 | $tested_invlist{$file} = dclone \@tested; | |
1026 | ||
1027 | # A leading '!' in the file name means that it is to be inverted. | |
1028 | my $invert = $file =~ s/^!//; | |
1029 | my $official = do "unicore/lib/$file.pl"; | |
1030 | ||
1031 | # Get rid of any trailing space and comments in the file. | |
1032 | $official =~ s/\s*(#.*)?$//mg; | |
1033 | chomp $official; | |
1034 | ||
1035 | # If we are to test against an inverted file, it is easier to invert | |
1036 | # our array than the file. | |
1037 | # The file only is valid for Unicode code points, while the inversion | |
1038 | # list is valid for all possible code points. Therefore, we must test | |
1039 | # just the Unicode part against the file. Later we will test for | |
1040 | # the non-Unicode part. | |
1041 | ||
1042 | my $before_invert; # Saves the pre-inverted table. | |
1043 | if ($invert) { | |
1044 | $before_invert = dclone \@tested; | |
1045 | if (@tested && $tested[0] == 0) { | |
1046 | shift @tested; | |
1047 | } else { | |
1048 | unshift @tested, 0; | |
1049 | } | |
1050 | if (@tested && $tested[-1] == 0x110000) { | |
1051 | pop @tested; | |
1052 | } | |
1053 | else { | |
1054 | push @tested, 0x110000; | |
1055 | } | |
1056 | } | |
1057 | ||
1058 | # Now construct a string from the list that should match the file. | |
1059 | # The file gives ranges of code points with starting and ending values | |
1060 | # in hex, like this: | |
1061 | # 0041\t005A | |
1062 | # 0061\t007A | |
1063 | # 00AA | |
1064 | # Our list has even numbered elements start ranges that are in the | |
1065 | # list, and odd ones that aren't in the list. Therefore the odd | |
1066 | # numbered ones are one beyond the end of the previous range, but | |
1067 | # otherwise don't get reflected in the file. | |
1068 | my $tested = ""; | |
1069 | my $i = 0; | |
1070 | for (; $i < @tested - 1; $i += 2) { | |
1071 | my $start = $tested[$i]; | |
1072 | my $end = $tested[$i+1] - 1; | |
1073 | if ($start == $end) { | |
1074 | $tested .= sprintf("%04X\n", $start); | |
1075 | } | |
1076 | else { | |
1077 | $tested .= sprintf "%04X\t%04X\n", $start, $end; | |
1078 | } | |
1079 | } | |
1080 | ||
1081 | # As mentioned earlier, the disk files only go up through Unicode, | |
1082 | # whereas the prop_invlist() ones go as high as necessary. The | |
1083 | # comparison is only valid through max Unicode. | |
1084 | if ($i == @tested - 1 && $tested[$i] <= 0x10FFFF) { | |
1085 | $tested .= sprintf("%04X\t10FFFF\n", $tested[$i]); | |
1086 | } | |
1087 | chomp $tested; | |
1088 | if ($tested ne $official) { | |
1089 | fail_with_diff($mod_table, $official, $tested, "prop_invlist"); | |
1090 | next; | |
1091 | } | |
1092 | ||
1093 | # Here, it matched the table. Now need to check for if it is correct | |
1094 | # for beyond Unicode. First, calculate if is the default table or | |
1095 | # not. This is the same algorithm as used internally in | |
1096 | # prop_invlist(), so if it is wrong there, this test won't catch it. | |
1097 | my $prop = lc $table; | |
1098 | ($prop_only, $table) = split /\s*[:=]\s*/, $prop; | |
1099 | if (defined $table) { | |
1100 | ||
1101 | # May have optional prefixed 'is' | |
1102 | $prop = utf8::_loose_name($prop_only) =~ s/^is//r; | |
1103 | $prop = $utf8::loose_property_name_of{$prop}; | |
1104 | $prop .= "=" . utf8::_loose_name($table); | |
1105 | } | |
1106 | else { | |
1107 | $prop = utf8::_loose_name($prop); | |
1108 | } | |
1109 | my $is_default = exists $Unicode::UCD::loose_defaults{$prop}; | |
1110 | ||
1111 | @tested = @$before_invert if $invert; # Use the original | |
1112 | if (@tested % 2 == 0) { | |
1113 | ||
1114 | # If there are an even number of elements, the final one starts a | |
1115 | # range (going to infinity) of code points that are not in the | |
1116 | # list. | |
1117 | if ($is_default) { | |
1118 | fail("prop_invlist('$mod_table')"); | |
1119 | diag("default table doesn't goto infinity"); | |
1120 | use Data::Dumper; | |
1121 | diag Dumper \@tested; | |
1122 | next; | |
1123 | } | |
1124 | } | |
1125 | else { | |
1126 | # An odd number of elements means the final one starts a range | |
1127 | # (going to infinity of code points that are in the list. | |
1128 | if (! $is_default) { | |
1129 | fail("prop_invlist('$mod_table')"); | |
1130 | diag("non-default table needs to stop in the Unicode range"); | |
1131 | use Data::Dumper; | |
1132 | diag Dumper \@tested; | |
1133 | next; | |
1134 | } | |
1135 | } | |
1136 | ||
1137 | pass("prop_invlist('$mod_table')"); | |
1138 | } | |
1139 | } | |
1140 | ||
62b3b855 KW |
1141 | # Now test prop_invmap(). |
1142 | ||
1143 | @list = prop_invmap("Unknown property"); | |
1144 | is (@list, 0, "prop_invmap(<Unknown property>) returns an empty list"); | |
1145 | @list = prop_invmap(undef); | |
1146 | is (@list, 0, "prop_invmap(undef) returns an empty list"); | |
1147 | ok (! eval "prop_invmap('gc')" && $@ ne "", | |
1148 | "prop_invmap('gc') dies in scalar context"); | |
1149 | @list = prop_invmap("_X_Begin"); | |
1150 | is (@list, 0, "prop_invmap(<internal property>) returns an empty list"); | |
1151 | @list = prop_invmap("InKana"); | |
1152 | is(@list, 0, "prop_invmap(<user-defined property returns undef>)"); | |
1153 | @list = prop_invmap("Perl_Decomposition_Mapping"), undef, | |
1154 | is(@list, 0, "prop_invmap('Perl_Decomposition_Mapping') returns <undef> since internal-Perl-only"); | |
1155 | @list = prop_invmap("Perl_Charnames"), undef, | |
1156 | is(@list, 0, "prop_invmap('Perl_Charnames') returns <undef> since internal-Perl-only"); | |
1157 | @list = prop_invmap("Is_Is_Any"); | |
1158 | is(@list, 0, "prop_invmap('Is_Is_Any') returns <undef> since two is's"); | |
1159 | ||
1160 | # The set of properties to test on has already been compiled into %props by | |
1161 | # the prop_aliases() tests. | |
1162 | ||
1163 | my %tested_invmaps; | |
1164 | ||
1165 | # Like prop_invlist(), prop_invmap() is tested by comparing the results | |
1166 | # returned by the function with the tables that mktables generates. Some of | |
1167 | # these tables are directly stored as files on disk, in either the unicore or | |
1168 | # unicore/To directories, and most should be listed in the mktables generated | |
1169 | # hash %utf8::loose_property_to_file_of, with a few additional ones that this | |
1170 | # handles specially. For these, the files are read in directly, massaged, and | |
1171 | # compared with what invmap() returns. The SPECIALS hash in some of these | |
1172 | # files overrides values in the main part of the file. | |
1173 | # | |
1174 | # The other properties are tested indirectly by generating all the possible | |
1175 | # inversion lists for the property, and seeing if those match the inversion | |
1176 | # lists returned by prop_invlist(), which has already been tested. | |
1177 | ||
1178 | PROPERTY: | |
1179 | foreach my $prop (keys %props) { | |
1180 | my $loose_prop = utf8::_loose_name(lc $prop); | |
1181 | my $suppressed = grep { $_ eq $loose_prop } | |
1182 | @Unicode::UCD::suppressed_properties; | |
1183 | ||
1184 | # Find the short and full names that this property goes by | |
1185 | my ($name, $full_name) = prop_aliases($prop); | |
1186 | if (! $name) { | |
1187 | if (! $suppressed) { | |
1188 | fail("prop_invmap('$prop')"); | |
1189 | diag("is unknown to prop_aliases(), and we need it in order to test prop_invmap"); | |
1190 | } | |
1191 | next PROPERTY; | |
1192 | } | |
1193 | ||
1194 | # Normalize the short name, as it is stored in the hashes under the | |
1195 | # normalized version. | |
1196 | $name = utf8::_loose_name(lc $name); | |
1197 | ||
1198 | # Add in the characters that are supposed to be ignored to test loose | |
1199 | # matching, which the tested function applies to all properties | |
1200 | my $mod_prop = "$extra_chars$prop"; | |
1201 | ||
1202 | my ($invlist_ref, $invmap_ref, $format, $missing) = prop_invmap($mod_prop); | |
1203 | my $return_ref = [ $invlist_ref, $invmap_ref, $format, $missing ]; | |
1204 | ||
1205 | # If have already tested this property under a different name, merely | |
1206 | # compare the return from now with the saved one from before. | |
1207 | if (exists $tested_invmaps{$name}) { | |
1208 | is_deeply($return_ref, $tested_invmaps{$name}, "prop_invmap('$mod_prop') gave same results as its synonym, '$name'"); | |
1209 | next PROPERTY; | |
1210 | } | |
1211 | $tested_invmaps{$name} = dclone $return_ref; | |
1212 | ||
1213 | # If prop_invmap() returned nothing, is ok iff is a property whose file is | |
1214 | # not generated. | |
1215 | if ($suppressed) { | |
1216 | if (defined $format) { | |
1217 | fail("prop_invmap('$mod_prop')"); | |
1218 | diag("did not return undef for suppressed property $prop"); | |
1219 | } | |
1220 | next PROPERTY; | |
1221 | } | |
1222 | elsif (!defined $format) { | |
1223 | fail("prop_invmap('$mod_prop')"); | |
1224 | diag("'$prop' is unknown to prop_invmap()"); | |
1225 | next PROPERTY; | |
1226 | } | |
1227 | ||
1228 | # The two parallel arrays must have the same number of elements. | |
1229 | if (@$invlist_ref != @$invmap_ref) { | |
1230 | fail("prop_invmap('$mod_prop')"); | |
1231 | diag("invlist has " | |
1232 | . scalar @$invlist_ref | |
1233 | . " while invmap has " | |
1234 | . scalar @$invmap_ref | |
1235 | . " elements"); | |
1236 | next PROPERTY; | |
1237 | } | |
1238 | ||
1239 | # The last element must be for the above-Unicode code points, and must be | |
1240 | # for the default value. | |
1241 | if ($invlist_ref->[-1] != 0x110000) { | |
1242 | fail("prop_invmap('$mod_prop')"); | |
1243 | diag("The last inversion list element is not 0x110000"); | |
1244 | next PROPERTY; | |
1245 | } | |
1246 | if ($invmap_ref->[-1] ne $missing) { | |
1247 | fail("prop_invmap('$mod_prop')"); | |
1248 | diag("The last inversion list element is '$invmap_ref->[-1]', and should be '$missing'"); | |
1249 | next PROPERTY; | |
1250 | } | |
1251 | ||
1252 | if ($name eq 'bmg') { # This one has an atypical $missing | |
1253 | if ($missing ne "") { | |
1254 | fail("prop_invmap('$mod_prop')"); | |
1255 | diag("The missings should be \"\"; got '$missing'"); | |
1256 | next PROPERTY; | |
1257 | } | |
1258 | } | |
1259 | elsif ($format =~ /^ [cd] /x) { | |
1260 | if ($missing ne "<code point>") { | |
1261 | fail("prop_invmap('$mod_prop')"); | |
1262 | diag("The missings should be '<code point>'; got '$missing'"); | |
1263 | next PROPERTY; | |
1264 | } | |
1265 | } | |
1266 | elsif ($missing =~ /[<>]/) { | |
1267 | fail("prop_invmap('$mod_prop')"); | |
1268 | diag("The missings should NOT be something with <...>'"); | |
1269 | next PROPERTY; | |
1270 | ||
1271 | # I don't want to hard code in what all the missings should be, so | |
1272 | # those don't get fully tested. | |
1273 | } | |
1274 | ||
1275 | # Certain properties don't have their own files, but must be constructed | |
1276 | # using proxies. | |
1277 | my $proxy_prop = $name; | |
1278 | if ($full_name eq 'Present_In') { | |
1279 | $proxy_prop = "age"; # The maps for these two props are identical | |
1280 | } | |
1281 | elsif ($full_name eq 'Simple_Case_Folding' | |
1282 | || $full_name =~ /Simple_ (.) .*? case_Mapping /x) | |
1283 | { | |
1284 | if ($full_name eq 'Simple_Case_Folding') { | |
1285 | $proxy_prop = 'cf'; | |
1286 | } | |
1287 | else { | |
1288 | # We captured the U, L, or T, leading to uc, lc, or tc. | |
1289 | $proxy_prop = lc $1 . "c"; | |
1290 | } | |
1291 | if ($format ne "c") { | |
1292 | fail("prop_invmap('$mod_prop')"); | |
1293 | diag("The format should be 'c'; got '$format'"); | |
1294 | next PROPERTY; | |
1295 | } | |
1296 | } | |
1297 | ||
1298 | my $base_file; | |
1299 | my $official; | |
1300 | ||
1301 | # Handle the properties that have full disk files for them (except the | |
1302 | # Name property which is structurally enough different that it is handled | |
1303 | # separately below.) | |
1304 | if ($name ne 'na' | |
1305 | && ($name eq 'blk' | |
1306 | || defined | |
1307 | ($base_file = $utf8::loose_property_to_file_of{$proxy_prop}) | |
1308 | || exists $utf8::loose_to_file_of{$proxy_prop} | |
1309 | || $name eq "dm")) | |
1310 | { | |
1311 | # In the above, blk is done unconditionally, as we need to test that | |
1312 | # the old-style block names are returned, even if mktables has | |
1313 | # generated a file for the new-style; the test for dm comes afterward, | |
1314 | # so that if a file has been generated for it explicitly, we use that | |
1315 | # file (which is valid, unlike blk) instead of the combo | |
1316 | # Decomposition.pl files. | |
1317 | my $file; | |
1318 | my $is_binary = 0; | |
1319 | if ($name eq 'blk') { | |
1320 | ||
1321 | # The blk property is special. The original file with old block | |
1322 | # names is retained, and the default is to not write out a | |
1323 | # new-name file. What we do is get the old names into a data | |
1324 | # structure, and from that create what the new file would look | |
1325 | # like. $base_file is needed to be defined, just to avoid a | |
1326 | # message below. | |
1327 | $base_file = "This is a dummy name"; | |
1328 | my $blocks_ref = charblocks(); | |
1329 | $official = ""; | |
1330 | for my $range (sort { $a->[0][0] <=> $b->[0][0] } | |
1331 | values %$blocks_ref) | |
1332 | { | |
1333 | # Translate the charblocks() data structure to what the file | |
1334 | # would like. | |
1335 | $official .= sprintf"%04X\t%04X\t%s\n", | |
1336 | $range->[0][0], | |
1337 | $range->[0][1], | |
1338 | $range->[0][2]; | |
1339 | } | |
1340 | } | |
1341 | else { | |
18e9d873 | 1342 | $base_file = "Decomposition" if $format eq 'd'; |
62b3b855 KW |
1343 | |
1344 | # Above leaves $base_file undefined only if it came from the hash | |
1345 | # below. This should happen only when it is a binary property | |
1346 | # (and are accessing via a single-form name, like 'In_Latin1'), | |
1347 | # and so it is stored in a different directory than the To ones. | |
1348 | # XXX Currently, the only cases where it is complemented are the | |
1349 | # ones that have no code points. And it works out for these that | |
1350 | # 1) complementing them, and then 2) adding or subtracting the | |
1351 | # initial 0 and final 110000 cancel each other out. But further | |
1352 | # work would be needed in the unlikely event that an inverted | |
1353 | # property comes along without these characteristics | |
1354 | if (!defined $base_file) { | |
1355 | $base_file = $utf8::loose_to_file_of{$proxy_prop}; | |
1356 | $is_binary = ($base_file =~ s/^!//) ? -1 : 1; | |
1357 | $base_file = "lib/$base_file"; | |
1358 | } | |
1359 | ||
1360 | # Read in the file | |
62b3b855 KW |
1361 | $file = "unicore/$base_file.pl"; |
1362 | $official = do $file; | |
1363 | ||
1364 | # Get rid of any trailing space and comments in the file. | |
1365 | $official =~ s/\s*(#.*)?$//mg; | |
1366 | ||
1367 | # Decomposition.pl also has the <compatible> types in it, which | |
1368 | # should be removed. | |
1369 | $official =~ s/<.*?> //mg if $format eq 'd'; | |
1370 | } | |
1371 | chomp $official; | |
1372 | ||
1373 | # If there are any special elements, get a reference to them. | |
1374 | my $specials_ref = $utf8::file_to_swash_name{$base_file}; | |
1375 | if ($specials_ref) { | |
1376 | $specials_ref = $utf8::SwashInfo{$specials_ref}{'specials_name'}; | |
1377 | if ($specials_ref) { | |
1378 | ||
1379 | # Convert from the name to the actual reference. | |
1380 | no strict 'refs'; | |
1381 | $specials_ref = \%{$specials_ref}; | |
1382 | } | |
1383 | } | |
1384 | ||
1385 | # Certain of the proxy properties have to be adjusted to match the | |
1386 | # real ones. | |
1387 | if (($proxy_prop ne $name && $full_name =~ 'Mapping') | |
1388 | || $full_name eq 'Case_Folding') | |
1389 | { | |
1390 | ||
1391 | # Here we have either | |
1392 | # 1) Case_Folding; or | |
1393 | # 2) a proxy that is a full mapping, which means that what the | |
1394 | # real property is is the equivalent simple mapping. | |
1395 | # In both cases, the file will have a standard list containing | |
1396 | # simple mappings (to a single code point), and a specials hash | |
1397 | # which contains all the mappings that are to multiple code | |
1398 | # points. First, extract a list containing all the file's simple | |
1399 | # mappings. | |
1400 | my @list; | |
1401 | for (split "\n", $official) { | |
1402 | my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?) | |
1403 | \s* ( \# .* )? | |
1404 | $ /x; | |
1405 | $end = $start if $end eq ""; | |
1406 | if ($end ne $start) { | |
1407 | fail("prop_invmap('$mod_prop')"); | |
1408 | diag("This test is expecting only single code point ranges in $file.pl"); | |
1409 | next PROPERTY; | |
1410 | } | |
1411 | push @list, [ hex $start, $value ]; | |
1412 | } | |
1413 | ||
1414 | # For Case_Folding, the file contains all the simple mappings, | |
1415 | # including the ones that are overridden by the specials. These | |
1416 | # need to be removed as the list is for just the full ones. For | |
1417 | # the other files, the proxy is missing the simple mappings that | |
1418 | # are overridden by the specials, so we need to add them. | |
1419 | ||
1420 | # For the missing simples, we get the correct values by calling | |
1421 | # charinfo(). Set up which element of the hash returned by | |
1422 | # charinfo to look at | |
1423 | my $charinfo_element; | |
1424 | if ($full_name =~ / ^ Simple_ (Lower | Upper | Title) case_Mapping/x) | |
1425 | { | |
1426 | $charinfo_element = lc $1; # e.g. Upper is referred to by the | |
1427 | # key 'upper' in the charinfo() | |
1428 | # returned hash | |
1429 | } | |
1430 | ||
1431 | # Go through any special mappings one by one. They are packed. | |
1432 | my $i = 0; | |
1433 | foreach my $utf8_cp (sort keys %$specials_ref) { | |
1434 | my $cp = unpack("C0U", $utf8_cp); | |
1435 | ||
1436 | # Get what the simple value for this should be; either nothing | |
1437 | # for Case_Folding, or what charinfo returns for the others. | |
1438 | my $simple = ($full_name eq "Case_Folding") | |
1439 | ? "" | |
1440 | : charinfo($cp)->{$charinfo_element}; | |
1441 | ||
1442 | # And create an entry to add to the list, if appropriate | |
1443 | my $replacement; | |
1444 | $replacement = [ $cp, $simple ] if $simple ne ""; | |
1445 | ||
1446 | # Find the spot in the @list of simple mappings that this | |
1447 | # special applies to; uses a linear search. | |
1448 | while ($i < @list -1 ) { | |
1449 | last if $cp <= $list[$i][0]; | |
1450 | $i++; | |
1451 | } | |
1452 | ||
1453 | #note $i-1 . ": " . join " => ", @{$list[$i-1]}; | |
1454 | #note $i-0 . ": " . join " => ", @{$list[$i-0]}; | |
1455 | #note $i+1 . ": " . join " => ", @{$list[$i+1]}; | |
1456 | ||
1457 | if (! defined $replacement) { | |
1458 | ||
1459 | # Here, are to remove any existing entry for this code | |
1460 | # point. | |
1461 | next if $cp != $list[$i][0]; | |
1462 | splice @list, $i, 1; | |
1463 | } | |
1464 | elsif ($cp == $list[$i][0]) { | |
1465 | ||
1466 | # Here, are to add something, but there is an existing | |
1467 | # entry, so this just replaces it. | |
1468 | $list[$i] = $replacement; | |
1469 | } | |
1470 | else { | |
1471 | ||
1472 | # Here, are to add something, and there isn't an existing | |
1473 | # entry. | |
1474 | splice @list, $i, 0, $replacement; | |
1475 | } | |
1476 | ||
1477 | #note __LINE__ . ": $cp"; | |
1478 | #note $i-1 . ": " . join " => ", @{$list[$i-1]}; | |
1479 | #note $i-0 . ": " . join " => ", @{$list[$i-0]}; | |
1480 | #note $i+1 . ": " . join " => ", @{$list[$i+1]}; | |
1481 | } | |
1482 | ||
1483 | # Here, have gone through all the specials, modifying @list as | |
1484 | # needed. Turn it back into what the file should look like. | |
1485 | $official = join "\n", map { sprintf "%04X\t\t%s", @$_ } @list; | |
1486 | ||
1487 | # And, no longer need the specials for the simple mappings, as are | |
1488 | # all incorporated into $official | |
1489 | undef $specials_ref if $full_name ne 'Case_Folding'; | |
1490 | } | |
1491 | elsif ($full_name eq 'Simple_Case_Folding') { | |
1492 | ||
1493 | # This property has everything in the regular array, and the | |
1494 | # specials are superfluous. | |
1495 | undef $specials_ref if $full_name ne 'Case_Folding'; | |
1496 | } | |
1497 | ||
1498 | # Here, in $official, we have what the file looks like, or should like | |
1499 | # if we've had to fix it up. Now take the invmap() output and reverse | |
1500 | # engineer from that what the file should look like. Each iteration | |
1501 | # appends the next line to the running string. | |
1502 | my $tested_map = ""; | |
1503 | ||
1504 | # Create a copy of the file's specials hash. (It has been undef'd if | |
1505 | # we know it isn't relevant to this property, so if it exists, it's an | |
1506 | # error or is relevant). As we go along, we delete from that copy. | |
1507 | # If a delete fails, or something is left over after we are done, | |
1508 | # it's an error | |
1509 | my %specials = %$specials_ref if $specials_ref; | |
1510 | ||
1511 | # The extra -1 is because the final element has been tested above to | |
1512 | # be for anything above Unicode. The file doesn't go that high. | |
1513 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
1514 | ||
1515 | # If the map element is a reference, have to stringify it (but | |
1516 | # don't do so if the format doesn't allow references, so that an | |
1517 | # improper format will generate an error. | |
1518 | if (ref $invmap_ref->[$i] | |
1519 | && ($format eq 'd' || $format =~ /^ . l /x)) | |
1520 | { | |
1521 | # The stringification depends on the format. At the time of | |
1522 | # this writing, all 'sl' formats are space separated. | |
1523 | if ($format eq 'sl') { | |
1524 | $invmap_ref->[$i] = join " ", @{$invmap_ref->[$i]}; | |
1525 | } | |
1526 | elsif ($format =~ / ^ cl e? $/x) { | |
1527 | ||
1528 | # For a cl property, the stringified result should be in | |
1529 | # the specials hash. The key is the packed code point, | |
1530 | # and the value is the packed map. | |
1531 | my $value; | |
1532 | if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) { | |
1533 | fail("prop_invmap('$mod_prop')"); | |
1534 | diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); | |
1535 | next PROPERTY; | |
1536 | } | |
1537 | my $packed = pack "U*", @{$invmap_ref->[$i]}; | |
1538 | if ($value ne $packed) { | |
1539 | fail("prop_invmap('$mod_prop')"); | |
1540 | diag(sprintf "For %04X, expected the mapping to be '$packed', but got '$value'"); | |
1541 | next PROPERTY; | |
1542 | } | |
1543 | ||
1544 | # As this doesn't get tested when we later compare with | |
1545 | # the actual file, it could be out of order and we | |
1546 | # wouldn't know it. | |
1547 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1548 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1549 | { | |
1550 | fail("prop_invmap('$mod_prop')"); | |
1551 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1552 | next PROPERTY; | |
1553 | } | |
1554 | next; | |
1555 | } | |
1556 | elsif ($format eq 'd') { | |
1557 | ||
1558 | # The decomposition mapping file has the code points as | |
1559 | # a string of space-separated hex constants. | |
1560 | $invmap_ref->[$i] = join " ", map { sprintf "%04X", $_ } @{$invmap_ref->[$i]}; | |
1561 | } | |
1562 | else { | |
1563 | fail("prop_invmap('$mod_prop')"); | |
1564 | diag("Can't handle format '$format'"); | |
1565 | next PROPERTY; | |
1566 | } | |
1567 | } | |
1568 | elsif ($format eq 'cle' && $invmap_ref->[$i] eq "") { | |
1569 | ||
1570 | # cle properties have maps to the empty string that also | |
1571 | # should be in the specials hash, with the key the packed code | |
1572 | # point, and the map just empty. | |
1573 | my $value; | |
1574 | if (! defined ($value = delete $specials{pack("C0U", $invlist_ref->[$i]) })) { | |
1575 | fail("prop_invmap('$mod_prop')"); | |
1576 | diag(sprintf "There was no specials element for %04X", $invlist_ref->[$i]); | |
1577 | next PROPERTY; | |
1578 | } | |
1579 | if ($value ne "") { | |
1580 | fail("prop_invmap('$mod_prop')"); | |
1581 | diag(sprintf "For %04X, expected the mapping to be \"\", but got '$value'", $invlist_ref->[$i]); | |
1582 | next PROPERTY; | |
1583 | } | |
1584 | ||
1585 | # As this doesn't get tested when we later compare with | |
1586 | # the actual file, it could be out of order and we | |
1587 | # wouldn't know it. | |
1588 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1589 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1590 | { | |
1591 | fail("prop_invmap('$mod_prop')"); | |
1592 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1593 | next PROPERTY; | |
1594 | } | |
1595 | next; | |
1596 | } | |
1597 | elsif ($is_binary) { # These binary files don't have an explicit Y | |
1598 | $invmap_ref->[$i] =~ s/Y//; | |
1599 | } | |
1600 | ||
1601 | # The file doesn't include entries that map to $missing, so don't | |
1602 | # include it in the built-up string. But make sure that it is in | |
1603 | # the correct order in the input. | |
1604 | if ($invmap_ref->[$i] eq $missing) { | |
1605 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1606 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1607 | { | |
1608 | fail("prop_invmap('$mod_prop')"); | |
1609 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1610 | next PROPERTY; | |
1611 | } | |
1612 | next; | |
1613 | } | |
1614 | ||
1615 | # 'c'-type and 'd' properties have the mapping expressed in hex in | |
1616 | # the file | |
1617 | if ($format =~ /^ [cd] /x) { | |
1618 | ||
1619 | # The d property has one entry which isn't in the file. | |
1620 | # Ignore it, but make sure it is in order. | |
1621 | if ($format eq 'd' | |
1622 | && $invmap_ref->[$i] eq '<hangul syllable>' | |
1623 | && $invlist_ref->[$i] == 0xAC00) | |
1624 | { | |
1625 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1626 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1627 | { | |
1628 | fail("prop_invmap('$mod_prop')"); | |
1629 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1630 | next PROPERTY; | |
1631 | } | |
1632 | next; | |
1633 | } | |
1634 | $invmap_ref->[$i] = sprintf("%04X", $invmap_ref->[$i]) | |
1635 | if $invmap_ref->[$i] =~ / ^ [A-Fa-f0-9]+ $/x; | |
1636 | } | |
1637 | ||
1638 | # Finally have figured out what the map column in the file should | |
1639 | # be. Append the line to the running string. | |
1640 | my $start = $invlist_ref->[$i]; | |
1641 | my $end = $invlist_ref->[$i+1] - 1; | |
1642 | $end = ($start == $end) ? "" : sprintf("%04X", $end); | |
1643 | if ($invmap_ref->[$i] ne "") { | |
1644 | $tested_map .= sprintf "%04X\t%s\t%s\n", $start, $end, $invmap_ref->[$i]; | |
1645 | } | |
1646 | elsif ($end ne "") { | |
1647 | $tested_map .= sprintf "%04X\t%s\n", $start, $end; | |
1648 | } | |
1649 | else { | |
1650 | $tested_map .= sprintf "%04X\n", $start; | |
1651 | } | |
1652 | } # End of looping over all elements. | |
1653 | ||
1654 | # Here are done with generating what the file should look like | |
1655 | ||
1656 | chomp $tested_map; | |
1657 | ||
1658 | # And compare. | |
1659 | if ($tested_map ne $official) { | |
1660 | fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); | |
1661 | next PROPERTY; | |
1662 | } | |
1663 | ||
1664 | # There shouldn't be any specials unaccounted for. | |
1665 | if (keys %specials) { | |
1666 | fail("prop_invmap('$mod_prop')"); | |
1667 | diag("Unexpected specials: " . join ", ", keys %specials); | |
1668 | next PROPERTY; | |
1669 | } | |
1670 | } | |
1671 | elsif ($format eq 'n') { | |
1672 | ||
1673 | # Handle the Name property similar to the above. But the file is | |
1674 | # sufficiently different that it is more convenient to make a special | |
1675 | # case for it. | |
1676 | ||
1677 | if ($missing ne "") { | |
1678 | fail("prop_invmap('$mod_prop')"); | |
1679 | diag("The missings should be \"\"; got \"missing\""); | |
1680 | next PROPERTY; | |
1681 | } | |
1682 | ||
1683 | $official = do "unicore/Name.pl"; | |
1684 | ||
1685 | # Get rid of the named sequences portion of the file. These don't | |
1686 | # have a tab before the first blank on a line. | |
1687 | $official =~ s/ ^ [^\t]+ \ .*? \n //xmg; | |
1688 | ||
1689 | # And get rid of the controls. These are named in the file, but | |
1690 | # shouldn't be in the property. | |
1691 | $official =~ s/ 00000 \t .* 0001F .*? \n//xs; | |
1692 | $official =~ s/ 0007F \t .* 0009F .*? \n//xs; | |
1693 | ||
1694 | # This is slow; it gets rid of the aliases. We look for lines that | |
1695 | # are for the same code point as the previous line. The previous line | |
1696 | # will be a name_alias; and the current line will be the name. Get | |
1697 | # rid of the name_alias line. This won't work if there are multiple | |
1698 | # aliases for a given name. | |
1699 | my @temp_names = split "\n", $official; | |
1700 | my $previous_cp = ""; | |
1701 | for (my $i = 0; $i < @temp_names - 1; $i++) { | |
1702 | $temp_names[$i] =~ /^ (.*)? \t /x; | |
1703 | my $current_cp = $1; | |
1704 | if ($current_cp eq $previous_cp) { | |
1705 | splice @temp_names, $i - 1, 1; | |
1706 | redo; | |
1707 | } | |
1708 | else { | |
1709 | $previous_cp = $current_cp; | |
1710 | } | |
1711 | } | |
1712 | $official = join "\n", @temp_names; | |
1713 | undef @temp_names; | |
1714 | chomp $official; | |
1715 | ||
1716 | # Here have adjusted the file. We also have to adjust the returned | |
1717 | # inversion map by checking and deleting all the lines in it that | |
1718 | # won't be in the file. These are the lines that have generated | |
1719 | # things, like <hangul syllable>. | |
1720 | my $tested_map = ""; # Current running string | |
1721 | my @code_point_in_names = | |
1722 | @Unicode::UCD::code_points_ending_in_code_point; | |
1723 | ||
1724 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
1725 | my $start = $invlist_ref->[$i]; | |
1726 | my $end = $invlist_ref->[$i+1] - 1; | |
1727 | if ($invmap_ref->[$i] eq $missing) { | |
1728 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1729 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1730 | { | |
1731 | fail("prop_invmap('$mod_prop')"); | |
1732 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1733 | next PROPERTY; | |
1734 | } | |
1735 | next; | |
1736 | } | |
1737 | if ($invmap_ref->[$i] =~ / (.*) ( < .*? > )/x) { | |
1738 | my $name = $1; | |
1739 | my $type = $2; | |
1740 | if (($i > 0 && $invlist_ref->[$i] <= $invlist_ref->[$i-1]) | |
1741 | || $invlist_ref->[$i] >= $invlist_ref->[$i+1]) | |
1742 | { | |
1743 | fail("prop_invmap('$mod_prop')"); | |
1744 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1745 | next PROPERTY; | |
1746 | } | |
1747 | if ($type eq "<hangul syllable>") { | |
1748 | if ($name ne "") { | |
1749 | fail("prop_invmap('$mod_prop')"); | |
1750 | diag("Unexpected text in $invmap_ref->[$i]"); | |
1751 | next PROPERTY; | |
1752 | } | |
1753 | if ($start != 0xAC00) { | |
1754 | fail("prop_invmap('$mod_prop')"); | |
1755 | diag(sprintf("<hangul syllables> should begin at 0xAC00, got %04X", $start)); | |
1756 | next PROPERTY; | |
1757 | } | |
1758 | if ($end != $start + 11172 - 1) { | |
1759 | fail("prop_invmap('$mod_prop')"); | |
1760 | diag(sprintf("<hangul syllables> should end at %04X, got %04X", $start + 11172 -1, $end)); | |
1761 | next PROPERTY; | |
1762 | } | |
1763 | } | |
1764 | elsif ($type ne "<code point>") { | |
1765 | fail("prop_invmap('$mod_prop')"); | |
1766 | diag("Unexpected text '$type' in $invmap_ref->[$i]"); | |
1767 | next PROPERTY; | |
1768 | } | |
1769 | else { | |
1770 | ||
1771 | # Look through the array of names that end in code points, | |
1772 | # and look for this start and end. If not found is an | |
1773 | # error. If found, delete it, and at the end, make sure | |
1774 | # have deleted everything. | |
1775 | for my $i (0 .. @code_point_in_names - 1) { | |
1776 | my $hash = $code_point_in_names[$i]; | |
1777 | if ($hash->{'low'} == $start | |
1778 | && $hash->{'high'} == $end | |
1779 | && "$hash->{'name'}-" eq $name) | |
1780 | { | |
1781 | splice @code_point_in_names, $i, 1; | |
1782 | last; | |
1783 | } | |
1784 | else { | |
1785 | fail("prop_invmap('$mod_prop')"); | |
1786 | diag("Unexpected code-point-in-name line '$invmap_ref->[$i]'"); | |
1787 | next PROPERTY; | |
1788 | } | |
1789 | } | |
1790 | } | |
1791 | ||
1792 | next; | |
1793 | } | |
1794 | ||
1795 | # Have adjusted the map, as needed. Append to running string. | |
1796 | $end = ($start == $end) ? "" : sprintf("%05X", $end); | |
1797 | $tested_map .= sprintf "%05X\t%s\n", $start, $invmap_ref->[$i]; | |
1798 | } | |
1799 | ||
1800 | # Finished creating the string from the inversion map. Can compare | |
1801 | # with what the file is. | |
1802 | chomp $tested_map; | |
1803 | if ($tested_map ne $official) { | |
1804 | fail_with_diff($mod_prop, $official, $tested_map, "prop_invmap"); | |
1805 | next PROPERTY; | |
1806 | } | |
1807 | if (@code_point_in_names) { | |
1808 | fail("prop_invmap('$mod_prop')"); | |
1809 | use Data::Dumper; | |
1810 | diag("Missing code-point-in-name line(s)" . Dumper \@code_point_in_names); | |
1811 | next PROPERTY; | |
1812 | } | |
1813 | } | |
1814 | elsif ($format eq 's' || $format eq 'r') { | |
1815 | ||
1816 | # Here the map is not more or less directly from a file stored on | |
1817 | # disk. We try a different tack. These should all be properties that | |
1818 | # have just a few possible values (most of them are binary). We go | |
1819 | # through the map list, sorting each range into buckets, one for each | |
1820 | # map value. Thus for binary properties there will be a bucket for Y | |
1821 | # and one for N. The buckets are inversion lists. We compare each | |
1822 | # constructed inversion list with what we would get for it using | |
1823 | # prop_invlist(), which has already been tested. If they all match, | |
1824 | # the whole map must have matched. | |
1825 | my %maps; | |
1826 | my $previous_map; | |
1827 | ||
1828 | # (The extra -1 is to not look at the final element in the loop, which | |
1829 | # we know is the one that starts just beyond Unicode and goes to | |
1830 | # infinity.) | |
1831 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
1832 | my $range_start = $invlist_ref->[$i]; | |
1833 | ||
1834 | # Because we are sorting into buckets, things could be | |
1835 | # out-of-order here, and still be in the correct order in the | |
1836 | # bucket, and hence wouldn't show up as an error; so have to | |
1837 | # check. | |
1838 | if (($i > 0 && $range_start <= $invlist_ref->[$i-1]) | |
1839 | || $range_start >= $invlist_ref->[$i+1]) | |
1840 | { | |
1841 | fail("prop_invmap('$mod_prop')"); | |
1842 | diag(sprintf "Range beginning at %04X is out-of-order.", $invlist_ref->[$i]); | |
1843 | next PROPERTY; | |
1844 | } | |
1845 | ||
1846 | # This new range closes out the range started in the previous | |
1847 | # iteration. | |
1848 | push @{$maps{$previous_map}}, $range_start if defined $previous_map; | |
1849 | ||
1850 | # And starts a range which will be closed in the next iteration. | |
1851 | $previous_map = $invmap_ref->[$i]; | |
1852 | push @{$maps{$previous_map}}, $range_start; | |
1853 | } | |
1854 | ||
1855 | # The range we just started hasn't been closed, and we didn't look at | |
1856 | # the final element of the loop. If that range is for the default | |
1857 | # value, it shouldn't be closed, as it is to extend to infinity. But | |
1858 | # otherwise, it should end at the final Unicode code point, and the | |
1859 | # list that maps to the default value should have another element that | |
1860 | # does go to infinity for every above Unicode code point. | |
1861 | ||
1862 | if (@$invlist_ref > 1) { | |
1863 | my $penultimate_map = $invmap_ref->[-2]; | |
1864 | if ($penultimate_map ne $missing) { | |
1865 | ||
1866 | # The -1th element contains the first non-Unicode code point. | |
1867 | push @{$maps{$penultimate_map}}, $invlist_ref->[-1]; | |
1868 | push @{$maps{$missing}}, $invlist_ref->[-1]; | |
1869 | } | |
1870 | } | |
1871 | ||
1872 | # Here, we have the buckets (inversion lists) all constructed. Go | |
1873 | # through each and verify that matches what prop_invlist() returns. | |
1874 | # We could use is_deeply() for the comparison, but would get multiple | |
1875 | # messages for each $prop. | |
1876 | foreach my $map (keys %maps) { | |
1877 | my @off_invlist = prop_invlist("$prop = $map"); | |
1878 | my $min = (@off_invlist >= @{$maps{$map}}) | |
1879 | ? @off_invlist | |
1880 | : @{$maps{$map}}; | |
1881 | for my $i (0 .. $min- 1) { | |
1882 | if ($i > @off_invlist - 1) { | |
1883 | fail("prop_invmap('$mod_prop')"); | |
1884 | diag("There is no element [$i] for $prop=$map from prop_invlist(), while [$i] in the implicit one constructed from prop_invmap() is '$maps{$map}[$i]'"); | |
1885 | next PROPERTY; | |
1886 | } | |
1887 | elsif ($i > @{$maps{$map}} - 1) { | |
1888 | fail("prop_invmap('$mod_prop')"); | |
1889 | diag("There is no element [$i] from the implicit $prop=$map constructed from prop_invmap(), while [$i] in the one from prop_invlist() is '$off_invlist[$i]'"); | |
1890 | next PROPERTY; | |
1891 | } | |
1892 | elsif ($maps{$map}[$i] ne $off_invlist[$i]) { | |
1893 | fail("prop_invmap('$mod_prop')"); | |
1894 | diag("Element [$i] of the implicit $prop=$map constructed from prop_invmap() is '$maps{$map}[$i]', and the one from prop_invlist() is '$off_invlist[$i]'"); | |
1895 | next PROPERTY; | |
1896 | } | |
1897 | } | |
1898 | } | |
1899 | } | |
1900 | else { # Don't know this property nor format. | |
1901 | ||
1902 | fail("prop_invmap('$mod_prop')"); | |
1903 | diag("Unknown format '$format'"); | |
1904 | } | |
1905 | ||
1906 | pass("prop_invmap('$mod_prop')"); | |
1907 | } | |
1908 | ||
eaebe4db | 1909 | done_testing(); |