This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add a regression test for bug #32193, and make the
[perl5.git] / lib / Unicode / UCD.t
1 #!perl -w
2 BEGIN {
3     if (ord("A") == 193) {
4         print "1..0 # Skip: EBCDIC\n";
5         exit 0;
6     }
7     chdir 't' if -d 't';
8     @INC = '../lib';
9     @INC = "::lib" if $^O eq 'MacOS'; # module parses @INC itself
10     require Config; import Config;
11     if ($Config{'extensions'} !~ /\bStorable\b/) {
12         print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n";
13         exit 0;
14     }
15 }
16
17 use strict;
18 use Unicode::UCD;
19 use Test::More;
20
21 BEGIN { plan tests => 179 };
22
23 use Unicode::UCD 'charinfo';
24
25 my $charinfo;
26
27 $charinfo = charinfo(0x41);
28
29 is($charinfo->{code},           '0041', 'LATIN CAPITAL LETTER A');
30 is($charinfo->{name},           'LATIN CAPITAL LETTER A');
31 is($charinfo->{category},       'Lu');
32 is($charinfo->{combining},      '0');
33 is($charinfo->{bidi},           'L');
34 is($charinfo->{decomposition},  '');
35 is($charinfo->{decimal},        '');
36 is($charinfo->{digit},          '');
37 is($charinfo->{numeric},        '');
38 is($charinfo->{mirrored},       'N');
39 is($charinfo->{unicode10},      '');
40 is($charinfo->{comment},        '');
41 is($charinfo->{upper},          '');
42 is($charinfo->{lower},          '0061');
43 is($charinfo->{title},          '');
44 is($charinfo->{block},          'Basic Latin');
45 is($charinfo->{script},         'Latin');
46
47 $charinfo = charinfo(0x100);
48
49 is($charinfo->{code},           '0100', 'LATIN CAPITAL LETTER A WITH MACRON');
50 is($charinfo->{name},           'LATIN CAPITAL LETTER A WITH MACRON');
51 is($charinfo->{category},       'Lu');
52 is($charinfo->{combining},      '0');
53 is($charinfo->{bidi},           'L');
54 is($charinfo->{decomposition},  '0041 0304');
55 is($charinfo->{decimal},        '');
56 is($charinfo->{digit},          '');
57 is($charinfo->{numeric},        '');
58 is($charinfo->{mirrored},       'N');
59 is($charinfo->{unicode10},      'LATIN CAPITAL LETTER A MACRON');
60 is($charinfo->{comment},        '');
61 is($charinfo->{upper},          '');
62 is($charinfo->{lower},          '0101');
63 is($charinfo->{title},          '');
64 is($charinfo->{block},          'Latin Extended-A');
65 is($charinfo->{script},         'Latin');
66
67 # 0x0590 is in the Hebrew block but unused.
68
69 $charinfo = charinfo(0x590);
70
71 is($charinfo->{code},          undef,   '0x0590 - unused Hebrew');
72 is($charinfo->{name},          undef);
73 is($charinfo->{category},      undef);
74 is($charinfo->{combining},     undef);
75 is($charinfo->{bidi},          undef);
76 is($charinfo->{decomposition}, undef);
77 is($charinfo->{decimal},       undef);
78 is($charinfo->{digit},         undef);
79 is($charinfo->{numeric},       undef);
80 is($charinfo->{mirrored},      undef);
81 is($charinfo->{unicode10},     undef);
82 is($charinfo->{comment},       undef);
83 is($charinfo->{upper},         undef);
84 is($charinfo->{lower},         undef);
85 is($charinfo->{title},         undef);
86 is($charinfo->{block},         undef);
87 is($charinfo->{script},        undef);
88
89 # 0x05d0 is in the Hebrew block and used.
90
91 $charinfo = charinfo(0x5d0);
92
93 is($charinfo->{code},           '05D0', '05D0 - used Hebrew');
94 is($charinfo->{name},           'HEBREW LETTER ALEF');
95 is($charinfo->{category},       'Lo');
96 is($charinfo->{combining},      '0');
97 is($charinfo->{bidi},           'R');
98 is($charinfo->{decomposition},  '');
99 is($charinfo->{decimal},        '');
100 is($charinfo->{digit},          '');
101 is($charinfo->{numeric},        '');
102 is($charinfo->{mirrored},       'N');
103 is($charinfo->{unicode10},      '');
104 is($charinfo->{comment},        '');
105 is($charinfo->{upper},          '');
106 is($charinfo->{lower},          '');
107 is($charinfo->{title},          '');
108 is($charinfo->{block},          'Hebrew');
109 is($charinfo->{script},         'Hebrew');
110
111 # An open syllable in Hangul.
112
113 $charinfo = charinfo(0xAC00);
114
115 is($charinfo->{code},           'AC00', 'HANGUL SYLLABLE-AC00');
116 is($charinfo->{name},           'HANGUL SYLLABLE-AC00');
117 is($charinfo->{category},       'Lo');
118 is($charinfo->{combining},      '0');
119 is($charinfo->{bidi},           'L');
120 is($charinfo->{decomposition},  undef);
121 is($charinfo->{decimal},        '');
122 is($charinfo->{digit},          '');
123 is($charinfo->{numeric},        '');
124 is($charinfo->{mirrored},       'N');
125 is($charinfo->{unicode10},      '');
126 is($charinfo->{comment},        '');
127 is($charinfo->{upper},          '');
128 is($charinfo->{lower},          '');
129 is($charinfo->{title},          '');
130 is($charinfo->{block},          'Hangul Syllables');
131 is($charinfo->{script},         'Hangul');
132
133 # A closed syllable in Hangul.
134
135 $charinfo = charinfo(0xAE00);
136
137 is($charinfo->{code},           'AE00', 'HANGUL SYLLABLE-AE00');
138 is($charinfo->{name},           'HANGUL SYLLABLE-AE00');
139 is($charinfo->{category},       'Lo');
140 is($charinfo->{combining},      '0');
141 is($charinfo->{bidi},           'L');
142 is($charinfo->{decomposition},  undef);
143 is($charinfo->{decimal},        '');
144 is($charinfo->{digit},          '');
145 is($charinfo->{numeric},        '');
146 is($charinfo->{mirrored},       'N');
147 is($charinfo->{unicode10},      '');
148 is($charinfo->{comment},        '');
149 is($charinfo->{upper},          '');
150 is($charinfo->{lower},          '');
151 is($charinfo->{title},          '');
152 is($charinfo->{block},          'Hangul Syllables');
153 is($charinfo->{script},         'Hangul');
154
155 $charinfo = charinfo(0x1D400);
156
157 is($charinfo->{code},           '1D400', 'MATHEMATICAL BOLD CAPITAL A');
158 is($charinfo->{name},           'MATHEMATICAL BOLD CAPITAL A');
159 is($charinfo->{category},       'Lu');
160 is($charinfo->{combining},      '0');
161 is($charinfo->{bidi},           'L');
162 is($charinfo->{decomposition},  '<font> 0041');
163 is($charinfo->{decimal},        '');
164 is($charinfo->{digit},          '');
165 is($charinfo->{numeric},        '');
166 is($charinfo->{mirrored},       'N');
167 is($charinfo->{unicode10},      '');
168 is($charinfo->{comment},        '');
169 is($charinfo->{upper},          '');
170 is($charinfo->{lower},          '');
171 is($charinfo->{title},          '');
172 is($charinfo->{block},          'Mathematical Alphanumeric Symbols');
173 is($charinfo->{script},         'Common');
174
175 use Unicode::UCD qw(charblock charscript);
176
177 # 0x0590 is in the Hebrew block but unused.
178
179 is(charblock(0x590),          'Hebrew', '0x0590 - Hebrew unused charblock');
180 is(charscript(0x590),         undef,    '0x0590 - Hebrew unused charscript');
181
182 $charinfo = charinfo(0xbe);
183
184 is($charinfo->{code},           '00BE', 'VULGAR FRACTION THREE QUARTERS');
185 is($charinfo->{name},           'VULGAR FRACTION THREE QUARTERS');
186 is($charinfo->{category},       'No');
187 is($charinfo->{combining},      '0');
188 is($charinfo->{bidi},           'ON');
189 is($charinfo->{decomposition},  '<fraction> 0033 2044 0034');
190 is($charinfo->{decimal},        '');
191 is($charinfo->{digit},          '');
192 is($charinfo->{numeric},        '3/4');
193 is($charinfo->{mirrored},       'N');
194 is($charinfo->{unicode10},      'FRACTION THREE QUARTERS');
195 is($charinfo->{comment},        '');
196 is($charinfo->{upper},          '');
197 is($charinfo->{lower},          '');
198 is($charinfo->{title},          '');
199 is($charinfo->{block},          'Latin-1 Supplement');
200 is($charinfo->{script},         'Common');
201
202 use Unicode::UCD qw(charblocks charscripts);
203
204 my $charblocks = charblocks();
205
206 ok(exists $charblocks->{Thai}, 'Thai charblock exists');
207 is($charblocks->{Thai}->[0]->[0], hex('0e00'));
208 ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist');
209
210 my $charscripts = charscripts();
211
212 ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
213 is($charscripts->{Armenian}->[0]->[0], hex('0531'));
214 ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');
215
216 my $charscript;
217
218 $charscript = charscript("12ab");
219 is($charscript, 'Ethiopic', 'Ethiopic charscript');
220
221 $charscript = charscript("0x12ab");
222 is($charscript, 'Ethiopic');
223
224 $charscript = charscript("U+12ab");
225 is($charscript, 'Ethiopic');
226
227 my $ranges;
228
229 $ranges = charscript('Ogham');
230 is($ranges->[1]->[0], hex('1681'), 'Ogham charscript');
231 is($ranges->[1]->[1], hex('169a'));
232
233 use Unicode::UCD qw(charinrange);
234
235 $ranges = charscript('Cherokee');
236 ok(!charinrange($ranges, "139f"), 'Cherokee charscript');
237 ok( charinrange($ranges, "13a0"));
238 ok( charinrange($ranges, "13f4"));
239 ok(!charinrange($ranges, "13f5"));
240
241 is(Unicode::UCD::UnicodeVersion, '4.0.1', 'UnicodeVersion');
242
243 use Unicode::UCD qw(compexcl);
244
245 ok(!compexcl(0x0100), 'compexcl');
246 ok( compexcl(0x0958));
247
248 use Unicode::UCD qw(casefold);
249
250 my $casefold;
251
252 $casefold = casefold(0x41);
253
254 ok($casefold->{code} eq '0041' &&
255    $casefold->{status} eq 'C'  &&
256    $casefold->{mapping} eq '0061', 'casefold 0x41');
257
258 $casefold = casefold(0xdf);
259
260 ok($casefold->{code} eq '00DF' &&
261    $casefold->{status} eq 'F'  &&
262    $casefold->{mapping} eq '0073 0073', 'casefold 0xDF');
263
264 ok(!casefold(0x20));
265
266 use Unicode::UCD qw(casespec);
267
268 my $casespec;
269
270 ok(!casespec(0x41));
271
272 $casespec = casespec(0xdf);
273
274 ok($casespec->{code} eq '00DF' &&
275    $casespec->{lower} eq '00DF'  &&
276    $casespec->{title} eq '0053 0073'  &&
277    $casespec->{upper} eq '0053 0053' &&
278    !defined $casespec->{condition}, 'casespec 0xDF');
279
280 $casespec = casespec(0x307);
281
282 ok($casespec->{az}->{code} eq '0307' &&
283    !defined $casespec->{az}->{lower} &&
284    $casespec->{az}->{title} eq '0307'  &&
285    $casespec->{az}->{upper} eq '0307' &&
286    $casespec->{az}->{condition} eq 'az After_I',
287    'casespec 0x307');
288
289 # perl #7305 UnicodeCD::compexcl is weird
290
291 for (1) {my $a=compexcl $_}
292 ok(1, 'compexcl read-only $_: perl #7305');
293 grep {compexcl $_} %{{1=>2}};
294 ok(1, 'compexcl read-only hash: perl #7305');
295
296 is(Unicode::UCD::_getcode('123'),     123, "_getcode(123)");
297 is(Unicode::UCD::_getcode('0123'),  0x123, "_getcode(0123)");
298 is(Unicode::UCD::_getcode('0x123'), 0x123, "_getcode(0x123)");
299 is(Unicode::UCD::_getcode('0X123'), 0x123, "_getcode(0X123)");
300 is(Unicode::UCD::_getcode('U+123'), 0x123, "_getcode(U+123)");
301 is(Unicode::UCD::_getcode('u+123'), 0x123, "_getcode(u+123)");
302 is(Unicode::UCD::_getcode('U+1234'),   0x1234, "_getcode(U+1234)");
303 is(Unicode::UCD::_getcode('U+12345'), 0x12345, "_getcode(U+12345)");
304 is(Unicode::UCD::_getcode('123x'),    undef, "_getcode(123x)");
305 is(Unicode::UCD::_getcode('x123'),    undef, "_getcode(x123)");
306 is(Unicode::UCD::_getcode('0x123x'),  undef, "_getcode(x123)");
307 is(Unicode::UCD::_getcode('U+123x'),  undef, "_getcode(x123)");
308
309 {
310     my $r1 = charscript('Latin');
311     my $n1 = @$r1;
312     is($n1, 26, "26 ranges in Latin script (Unicode 4.0.0)");
313     shift @$r1 while @$r1;
314     my $r2 = charscript('Latin');
315     is(@$r2, $n1, "modifying results should not mess up internal caches");
316 }
317
318 {
319         is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD");
320 }