This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
UCD.t: Add test for non-Unicode code point
[perl5.git] / lib / Unicode / UCD.t
1 #!perl -w
2 BEGIN {
3     if (ord("A") != 65) {
4         print "1..0 # Skip: EBCDIC\n";
5         exit 0;
6     }
7     chdir 't' if -d 't';
8     @INC = '../lib';
9     require Config; import Config;
10     if ($Config{'extensions'} !~ /\bStorable\b/) {
11         print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n";
12         exit 0;
13     }
14 }
15
16 use strict;
17 use Unicode::UCD;
18 use Test::More;
19
20 BEGIN { plan tests => 270 };
21
22 use Unicode::UCD 'charinfo';
23
24 my $charinfo;
25
26 is(charinfo(0x110000), undef, "Verify charinfo() of non-unicode is undef");
27
28 $charinfo = charinfo(0);    # Null is often problematic, so test it.
29
30 is($charinfo->{code},           '0000', '<control>');
31 is($charinfo->{name},           '<control>');
32 is($charinfo->{category},       'Cc');
33 is($charinfo->{combining},      '0');
34 is($charinfo->{bidi},           'BN');
35 is($charinfo->{decomposition},  '');
36 is($charinfo->{decimal},        '');
37 is($charinfo->{digit},          '');
38 is($charinfo->{numeric},        '');
39 is($charinfo->{mirrored},       'N');
40 is($charinfo->{unicode10},      'NULL');
41 is($charinfo->{comment},        '');
42 is($charinfo->{upper},          '');
43 is($charinfo->{lower},          '');
44 is($charinfo->{title},          '');
45 is($charinfo->{block},          'Basic Latin');
46 is($charinfo->{script},         'Common');
47
48 $charinfo = charinfo(0x41);
49
50 is($charinfo->{code},           '0041', 'LATIN CAPITAL LETTER A');
51 is($charinfo->{name},           'LATIN CAPITAL LETTER A');
52 is($charinfo->{category},       'Lu');
53 is($charinfo->{combining},      '0');
54 is($charinfo->{bidi},           'L');
55 is($charinfo->{decomposition},  '');
56 is($charinfo->{decimal},        '');
57 is($charinfo->{digit},          '');
58 is($charinfo->{numeric},        '');
59 is($charinfo->{mirrored},       'N');
60 is($charinfo->{unicode10},      '');
61 is($charinfo->{comment},        '');
62 is($charinfo->{upper},          '');
63 is($charinfo->{lower},          '0061');
64 is($charinfo->{title},          '');
65 is($charinfo->{block},          'Basic Latin');
66 is($charinfo->{script},         'Latin');
67
68 $charinfo = charinfo(0x100);
69
70 is($charinfo->{code},           '0100', 'LATIN CAPITAL LETTER A WITH MACRON');
71 is($charinfo->{name},           'LATIN CAPITAL LETTER A WITH MACRON');
72 is($charinfo->{category},       'Lu');
73 is($charinfo->{combining},      '0');
74 is($charinfo->{bidi},           'L');
75 is($charinfo->{decomposition},  '0041 0304');
76 is($charinfo->{decimal},        '');
77 is($charinfo->{digit},          '');
78 is($charinfo->{numeric},        '');
79 is($charinfo->{mirrored},       'N');
80 is($charinfo->{unicode10},      'LATIN CAPITAL LETTER A MACRON');
81 is($charinfo->{comment},        '');
82 is($charinfo->{upper},          '');
83 is($charinfo->{lower},          '0101');
84 is($charinfo->{title},          '');
85 is($charinfo->{block},          'Latin Extended-A');
86 is($charinfo->{script},         'Latin');
87
88 # 0x0590 is in the Hebrew block but unused.
89
90 $charinfo = charinfo(0x590);
91
92 is($charinfo->{code},          undef,   '0x0590 - unused Hebrew');
93 is($charinfo->{name},          undef);
94 is($charinfo->{category},      undef);
95 is($charinfo->{combining},     undef);
96 is($charinfo->{bidi},          undef);
97 is($charinfo->{decomposition}, undef);
98 is($charinfo->{decimal},       undef);
99 is($charinfo->{digit},         undef);
100 is($charinfo->{numeric},       undef);
101 is($charinfo->{mirrored},      undef);
102 is($charinfo->{unicode10},     undef);
103 is($charinfo->{comment},       undef);
104 is($charinfo->{upper},         undef);
105 is($charinfo->{lower},         undef);
106 is($charinfo->{title},         undef);
107 is($charinfo->{block},         undef);
108 is($charinfo->{script},        undef);
109
110 # 0x05d0 is in the Hebrew block and used.
111
112 $charinfo = charinfo(0x5d0);
113
114 is($charinfo->{code},           '05D0', '05D0 - used Hebrew');
115 is($charinfo->{name},           'HEBREW LETTER ALEF');
116 is($charinfo->{category},       'Lo');
117 is($charinfo->{combining},      '0');
118 is($charinfo->{bidi},           'R');
119 is($charinfo->{decomposition},  '');
120 is($charinfo->{decimal},        '');
121 is($charinfo->{digit},          '');
122 is($charinfo->{numeric},        '');
123 is($charinfo->{mirrored},       'N');
124 is($charinfo->{unicode10},      '');
125 is($charinfo->{comment},        '');
126 is($charinfo->{upper},          '');
127 is($charinfo->{lower},          '');
128 is($charinfo->{title},          '');
129 is($charinfo->{block},          'Hebrew');
130 is($charinfo->{script},         'Hebrew');
131
132 # An open syllable in Hangul.
133
134 $charinfo = charinfo(0xAC00);
135
136 is($charinfo->{code},           'AC00', 'HANGUL SYLLABLE-AC00');
137 is($charinfo->{name},           'HANGUL SYLLABLE-AC00');
138 is($charinfo->{category},       'Lo');
139 is($charinfo->{combining},      '0');
140 is($charinfo->{bidi},           'L');
141 is($charinfo->{decomposition},  undef);
142 is($charinfo->{decimal},        '');
143 is($charinfo->{digit},          '');
144 is($charinfo->{numeric},        '');
145 is($charinfo->{mirrored},       'N');
146 is($charinfo->{unicode10},      '');
147 is($charinfo->{comment},        '');
148 is($charinfo->{upper},          '');
149 is($charinfo->{lower},          '');
150 is($charinfo->{title},          '');
151 is($charinfo->{block},          'Hangul Syllables');
152 is($charinfo->{script},         'Hangul');
153
154 # A closed syllable in Hangul.
155
156 $charinfo = charinfo(0xAE00);
157
158 is($charinfo->{code},           'AE00', 'HANGUL SYLLABLE-AE00');
159 is($charinfo->{name},           'HANGUL SYLLABLE-AE00');
160 is($charinfo->{category},       'Lo');
161 is($charinfo->{combining},      '0');
162 is($charinfo->{bidi},           'L');
163 is($charinfo->{decomposition},  undef);
164 is($charinfo->{decimal},        '');
165 is($charinfo->{digit},          '');
166 is($charinfo->{numeric},        '');
167 is($charinfo->{mirrored},       'N');
168 is($charinfo->{unicode10},      '');
169 is($charinfo->{comment},        '');
170 is($charinfo->{upper},          '');
171 is($charinfo->{lower},          '');
172 is($charinfo->{title},          '');
173 is($charinfo->{block},          'Hangul Syllables');
174 is($charinfo->{script},         'Hangul');
175
176 $charinfo = charinfo(0x1D400);
177
178 is($charinfo->{code},           '1D400', 'MATHEMATICAL BOLD CAPITAL A');
179 is($charinfo->{name},           'MATHEMATICAL BOLD CAPITAL A');
180 is($charinfo->{category},       'Lu');
181 is($charinfo->{combining},      '0');
182 is($charinfo->{bidi},           'L');
183 is($charinfo->{decomposition},  '<font> 0041');
184 is($charinfo->{decimal},        '');
185 is($charinfo->{digit},          '');
186 is($charinfo->{numeric},        '');
187 is($charinfo->{mirrored},       'N');
188 is($charinfo->{unicode10},      '');
189 is($charinfo->{comment},        '');
190 is($charinfo->{upper},          '');
191 is($charinfo->{lower},          '');
192 is($charinfo->{title},          '');
193 is($charinfo->{block},          'Mathematical Alphanumeric Symbols');
194 is($charinfo->{script},         'Common');
195
196 $charinfo = charinfo(0x9FBA);   #Bug 58428
197
198 is($charinfo->{code},           '9FBA', 'U+9FBA');
199 is($charinfo->{name},           'CJK UNIFIED IDEOGRAPH-9FBA');
200 is($charinfo->{category},       'Lo');
201 is($charinfo->{combining},      '0');
202 is($charinfo->{bidi},           'L');
203 is($charinfo->{decomposition},  '');
204 is($charinfo->{decimal},        '');
205 is($charinfo->{digit},          '');
206 is($charinfo->{numeric},        '');
207 is($charinfo->{mirrored},       'N');
208 is($charinfo->{unicode10},      '');
209 is($charinfo->{comment},        '');
210 is($charinfo->{upper},          '');
211 is($charinfo->{lower},          '');
212 is($charinfo->{title},          '');
213 is($charinfo->{block},          'CJK Unified Ideographs');
214 is($charinfo->{script},         'Han');
215
216 use Unicode::UCD qw(charblock charscript);
217
218 # 0x0590 is in the Hebrew block but unused.
219
220 is(charblock(0x590),          'Hebrew', '0x0590 - Hebrew unused charblock');
221 is(charscript(0x590),         undef,    '0x0590 - Hebrew unused charscript');
222
223 $charinfo = charinfo(0xbe);
224
225 is($charinfo->{code},           '00BE', 'VULGAR FRACTION THREE QUARTERS');
226 is($charinfo->{name},           'VULGAR FRACTION THREE QUARTERS');
227 is($charinfo->{category},       'No');
228 is($charinfo->{combining},      '0');
229 is($charinfo->{bidi},           'ON');
230 is($charinfo->{decomposition},  '<fraction> 0033 2044 0034');
231 is($charinfo->{decimal},        '');
232 is($charinfo->{digit},          '');
233 is($charinfo->{numeric},        '3/4');
234 is($charinfo->{mirrored},       'N');
235 is($charinfo->{unicode10},      'FRACTION THREE QUARTERS');
236 is($charinfo->{comment},        '');
237 is($charinfo->{upper},          '');
238 is($charinfo->{lower},          '');
239 is($charinfo->{title},          '');
240 is($charinfo->{block},          'Latin-1 Supplement');
241 is($charinfo->{script},         'Common');
242
243 use Unicode::UCD qw(charblocks charscripts);
244
245 my $charblocks = charblocks();
246
247 ok(exists $charblocks->{Thai}, 'Thai charblock exists');
248 is($charblocks->{Thai}->[0]->[0], hex('0e00'));
249 ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist');
250
251 my $charscripts = charscripts();
252
253 ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
254 is($charscripts->{Armenian}->[0]->[0], hex('0531'));
255 ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');
256
257 my $charscript;
258
259 $charscript = charscript("12ab");
260 is($charscript, 'Ethiopic', 'Ethiopic charscript');
261
262 $charscript = charscript("0x12ab");
263 is($charscript, 'Ethiopic');
264
265 $charscript = charscript("U+12ab");
266 is($charscript, 'Ethiopic');
267
268 my $ranges;
269
270 $ranges = charscript('Ogham');
271 is($ranges->[0]->[0], hex('1680'), 'Ogham charscript');
272 is($ranges->[0]->[1], hex('169C'));
273
274 use Unicode::UCD qw(charinrange);
275
276 $ranges = charscript('Cherokee');
277 ok(!charinrange($ranges, "139f"), 'Cherokee charscript');
278 ok( charinrange($ranges, "13a0"));
279 ok( charinrange($ranges, "13f4"));
280 ok(!charinrange($ranges, "13f5"));
281
282 use Unicode::UCD qw(general_categories);
283
284 my $gc = general_categories();
285
286 ok(exists $gc->{L}, 'has L');
287 is($gc->{L}, 'Letter', 'L is Letter');
288 is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter');
289
290 use Unicode::UCD qw(bidi_types);
291
292 my $bt = bidi_types();
293
294 ok(exists $bt->{L}, 'has L');
295 is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right');
296 is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
297
298 # If this fails, then maybe one should look at the Unicode changes to see
299 # what else might need to be updated.
300 is(Unicode::UCD::UnicodeVersion, '6.0.0', 'UnicodeVersion');
301
302 use Unicode::UCD qw(compexcl);
303
304 ok(!compexcl(0x0100), 'compexcl');
305 ok(!compexcl(0xD801), 'compexcl of surrogate');
306 ok(!compexcl(0x110000), 'compexcl of non-Unicode code point');
307 ok( compexcl(0x0958));
308
309 use Unicode::UCD qw(casefold);
310
311 my $casefold;
312
313 $casefold = casefold(0x41);
314
315 is($casefold->{code}, '0041', 'casefold 0x41 code');
316 is($casefold->{status}, 'C', 'casefold 0x41 status');
317 is($casefold->{mapping}, '0061', 'casefold 0x41 mapping');
318 is($casefold->{full}, '0061', 'casefold 0x41 full');
319 is($casefold->{simple}, '0061', 'casefold 0x41 simple');
320 is($casefold->{turkic}, "", 'casefold 0x41 turkic');
321
322 $casefold = casefold(0xdf);
323
324 is($casefold->{code}, '00DF', 'casefold 0xDF code');
325 is($casefold->{status}, 'F', 'casefold 0xDF status');
326 is($casefold->{mapping}, '0073 0073', 'casefold 0xDF mapping');
327 is($casefold->{full}, '0073 0073', 'casefold 0xDF full');
328 is($casefold->{simple}, "", 'casefold 0xDF simple');
329 is($casefold->{turkic}, "", 'casefold 0xDF turkic');
330
331 # Do different tests depending on if version <= 3.1, or not.
332 (my $version = Unicode::UCD::UnicodeVersion) =~ /^(\d+)\.(\d+)/;
333 if (defined $1 && ($1 <= 2 || $1 == 3 && defined $2 && $2 <= 1)) {
334         $casefold = casefold(0x130);
335
336         is($casefold->{code}, '0130', 'casefold 0x130 code');
337         is($casefold->{status}, 'I' , 'casefold 0x130 status');
338         is($casefold->{mapping}, '0069', 'casefold 0x130 mapping');
339         is($casefold->{full}, '0069', 'casefold 0x130 full');
340         is($casefold->{simple}, "0069", 'casefold 0x130 simple');
341         is($casefold->{turkic}, "0069", 'casefold 0x130 turkic');
342
343         $casefold = casefold(0x131);
344
345         is($casefold->{code}, '0131', 'casefold 0x131 code');
346         is($casefold->{status}, 'I' , 'casefold 0x131 status');
347         is($casefold->{mapping}, '0069', 'casefold 0x131 mapping');
348         is($casefold->{full}, '0069', 'casefold 0x131 full');
349         is($casefold->{simple}, "0069", 'casefold 0x131 simple');
350         is($casefold->{turkic}, "0069", 'casefold 0x131 turkic');
351 } else {
352         $casefold = casefold(0x49);
353
354         is($casefold->{code}, '0049', 'casefold 0x49 code');
355         is($casefold->{status}, 'C' , 'casefold 0x49 status');
356         is($casefold->{mapping}, '0069', 'casefold 0x49 mapping');
357         is($casefold->{full}, '0069', 'casefold 0x49 full');
358         is($casefold->{simple}, "0069", 'casefold 0x49 simple');
359         is($casefold->{turkic}, "0131", 'casefold 0x49 turkic');
360
361         $casefold = casefold(0x130);
362
363         is($casefold->{code}, '0130', 'casefold 0x130 code');
364         is($casefold->{status}, 'F' , 'casefold 0x130 status');
365         is($casefold->{mapping}, '0069 0307', 'casefold 0x130 mapping');
366         is($casefold->{full}, '0069 0307', 'casefold 0x130 full');
367         is($casefold->{simple}, "", 'casefold 0x130 simple');
368         is($casefold->{turkic}, "0069", 'casefold 0x130 turkic');
369 }
370
371 $casefold = casefold(0x1F88);
372
373 is($casefold->{code}, '1F88', 'casefold 0x1F88 code');
374 is($casefold->{status}, 'S' , 'casefold 0x1F88 status');
375 is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping');
376 is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full');
377 is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple');
378 is($casefold->{turkic}, "", 'casefold 0x1F88 turkic');
379
380 ok(!casefold(0x20));
381
382 use Unicode::UCD qw(casespec);
383
384 my $casespec;
385
386 ok(!casespec(0x41));
387
388 $casespec = casespec(0xdf);
389
390 ok($casespec->{code} eq '00DF' &&
391    $casespec->{lower} eq '00DF'  &&
392    $casespec->{title} eq '0053 0073'  &&
393    $casespec->{upper} eq '0053 0053' &&
394    !defined $casespec->{condition}, 'casespec 0xDF');
395
396 $casespec = casespec(0x307);
397
398 ok($casespec->{az}->{code} eq '0307' &&
399    !defined $casespec->{az}->{lower} &&
400    $casespec->{az}->{title} eq '0307'  &&
401    $casespec->{az}->{upper} eq '0307' &&
402    $casespec->{az}->{condition} eq 'az After_I',
403    'casespec 0x307');
404
405 # perl #7305 UnicodeCD::compexcl is weird
406
407 for (1) {my $a=compexcl $_}
408 ok(1, 'compexcl read-only $_: perl #7305');
409 map {compexcl $_} %{{1=>2}};
410 ok(1, 'compexcl read-only hash: perl #7305');
411
412 is(Unicode::UCD::_getcode('123'),     123, "_getcode(123)");
413 is(Unicode::UCD::_getcode('0123'),  0x123, "_getcode(0123)");
414 is(Unicode::UCD::_getcode('0x123'), 0x123, "_getcode(0x123)");
415 is(Unicode::UCD::_getcode('0X123'), 0x123, "_getcode(0X123)");
416 is(Unicode::UCD::_getcode('U+123'), 0x123, "_getcode(U+123)");
417 is(Unicode::UCD::_getcode('u+123'), 0x123, "_getcode(u+123)");
418 is(Unicode::UCD::_getcode('U+1234'),   0x1234, "_getcode(U+1234)");
419 is(Unicode::UCD::_getcode('U+12345'), 0x12345, "_getcode(U+12345)");
420 is(Unicode::UCD::_getcode('123x'),    undef, "_getcode(123x)");
421 is(Unicode::UCD::_getcode('x123'),    undef, "_getcode(x123)");
422 is(Unicode::UCD::_getcode('0x123x'),  undef, "_getcode(x123)");
423 is(Unicode::UCD::_getcode('U+123x'),  undef, "_getcode(x123)");
424
425 {
426     my $r1 = charscript('Latin');
427     my $n1 = @$r1;
428     is($n1, 30, "number of ranges in Latin script (Unicode 6.0.0)");
429     shift @$r1 while @$r1;
430     my $r2 = charscript('Latin');
431     is(@$r2, $n1, "modifying results should not mess up internal caches");
432 }
433
434 {
435         is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD");
436 }
437
438 use Unicode::UCD qw(namedseq);
439
440 is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq");
441 is(namedseq("KATAKANA LETTER AINU Q"), undef);
442 is(namedseq(), undef);
443 is(namedseq(qw(foo bar)), undef);
444 my @ns = namedseq("KATAKANA LETTER AINU P");
445 is(scalar @ns, 2);
446 is($ns[0], 0x31F7);
447 is($ns[1], 0x309A);
448 my %ns = namedseq();
449 is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
450 @ns = namedseq(42);
451 is(@ns, 0);
452
453 use Unicode::UCD qw(num);
454 use charnames ":full";
455
456 is(num("0"), 0, 'Verify num("0") == 0');
457 is(num("98765"), 98765, 'Verify num("98765") == 98765');
458 ok(! defined num("98765\N{FULLWIDTH DIGIT FOUR}"), 'Verify num("98765\N{FULLWIDTH DIGIT FOUR}") isnt defined');
459 is(num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}"), 21, 'Verify \N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE DIGIT ONE}" == 21');
460 ok(! defined num("\N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}"), 'Verify \N{NEW TAI LUE DIGIT TWO}\N{NEW TAI LUE THAM DIGIT ONE}" isnt defined');
461 is(num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}"), 3, 'Verify num("\N{CHAM DIGIT ZERO}\N{CHAM DIGIT THREE}") == 3');
462 ok(! defined num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}"), 'Verify num("\N{CHAM DIGIT ZERO}\N{JAVANESE DIGIT NINE}") isnt defined');
463 is(num("\N{SUPERSCRIPT TWO}"), 2, 'Verify num("\N{SUPERSCRIPT TWO} == 2');
464 is(num("\N{ETHIOPIC NUMBER TEN THOUSAND}"), 10000, 'Verify num("\N{ETHIOPIC NUMBER TEN THOUSAND}") == 10000');
465 is(num("\N{NORTH INDIC FRACTION ONE HALF}"), .5, 'Verify num("\N{NORTH INDIC FRACTION ONE HALF}") == .5');
466 is(num("\N{U+12448}"), 9, 'Verify num("\N{U+12448}") == 9');