This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Unicode::Normalize from version 1.19 to 1.21
[perl5.git] / cpan / Unicode-Normalize / t / func.t
CommitLineData
ac5ea531 1
4a2e806c 2BEGIN {
65e3945f
CBW
3 unless ('A' eq pack('U', 0x41)) {
4 print "1..0 # Unicode::Normalize cannot pack a Unicode code point\n";
5 exit 0;
6 }
7 unless (0x41 == unpack('U', 'A')) {
8 print "1..0 # Unicode::Normalize cannot get a Unicode code point\n";
f027f502 9 exit 0;
4a2e806c
JH
10 }
11}
12
6c941e0c
JH
13BEGIN {
14 if ($ENV{PERL_CORE}) {
95f3e8d2
SH
15 chdir('t') if -d 't';
16 @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
6c941e0c
JH
17 }
18}
19
ac5ea531
JH
20#########################
21
ac5ea531
JH
22use strict;
23use warnings;
b7657688
A
24BEGIN { $| = 1; print "1..217\n"; }
25my $count = 0;
26sub ok ($;$) {
27 my $p = my $r = shift;
28 if (@_) {
29 my $x = shift;
30 $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x;
31 }
32 print $p ? "ok" : "not ok", ' ', ++$count, "\n";
33}
34
ac5ea531 35use Unicode::Normalize qw(:all);
b7657688
A
36
37ok(1);
ac5ea531 38
fe067ad9
SP
39sub _pack_U { Unicode::Normalize::pack_U(@_) }
40sub hexU { _pack_U map hex, split ' ', shift }
6c941e0c 41
3ece07bc
SH
42# This won't work on EBCDIC platforms prior to v5.8.0, which is when this
43# translation function was defined
44*to_native = (defined &utf8::unicode_to_native)
45 ? \&utf8::unicode_to_native
46 : sub { return shift };
47
ac5ea531
JH
48#########################
49
3ece07bc
SH
50ok(getCombinClass( to_native(0)), 0);
51ok(getCombinClass(to_native(41)), 0);
52ok(getCombinClass(to_native(65)), 0);
fe067ad9
SP
53ok(getCombinClass( 768), 230);
54ok(getCombinClass(1809), 36);
55
3ece07bc
SH
56ok(getCanon(to_native( 0)), undef);
57ok(getCanon(to_native(0x29)), undef);
58ok(getCanon(to_native(0x41)), undef);
59ok(getCanon(to_native(0x00C0)), _pack_U(0x0041, 0x0300));
60ok(getCanon(to_native(0x00EF)), _pack_U(0x0069, 0x0308));
fe067ad9
SP
61ok(getCanon(0x304C), _pack_U(0x304B, 0x3099));
62ok(getCanon(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
63ok(getCanon(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
64ok(getCanon(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345));
65ok(getCanon(0xAC00), _pack_U(0x1100, 0x1161));
66ok(getCanon(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF));
67ok(getCanon(0x212C), undef);
68ok(getCanon(0x3243), undef);
69ok(getCanon(0xFA2D), _pack_U(0x9DB4));
70
e0a65de5
CBW
71# 20
72
3ece07bc
SH
73ok(getCompat(to_native( 0)), undef);
74ok(getCompat(to_native(0x29)), undef);
75ok(getCompat(to_native(0x41)), undef);
76ok(getCompat(to_native(0x00C0)), _pack_U(0x0041, 0x0300));
77ok(getCompat(to_native(0x00EF)), _pack_U(0x0069, 0x0308));
fe067ad9
SP
78ok(getCompat(0x304C), _pack_U(0x304B, 0x3099));
79ok(getCompat(0x1EA4), _pack_U(0x0041, 0x0302, 0x0301));
80ok(getCompat(0x1F82), _pack_U(0x03B1, 0x0313, 0x0300, 0x0345));
81ok(getCompat(0x1FAF), _pack_U(0x03A9, 0x0314, 0x0342, 0x0345));
82ok(getCompat(0x212C), _pack_U(0x0042));
83ok(getCompat(0x3243), _pack_U(0x0028, 0x81F3, 0x0029));
84ok(getCompat(0xAC00), _pack_U(0x1100, 0x1161));
85ok(getCompat(0xAE00), _pack_U(0x1100, 0x1173, 0x11AF));
86ok(getCompat(0xFA2D), _pack_U(0x9DB4));
87
b7657688
A
88# 34
89
3ece07bc
SH
90ok(getComposite(to_native( 0), to_native( 0)), undef);
91ok(getComposite(to_native( 0), to_native(0x29)), undef);
92ok(getComposite(to_native(0x29), to_native( 0)), undef);
93ok(getComposite(to_native(0x29), to_native(0x29)), undef);
94ok(getComposite(to_native( 0), to_native(0x41)), undef);
95ok(getComposite(to_native(0x41), to_native( 0)), undef);
96ok(getComposite(to_native(0x41), to_native(0x41)), undef);
97ok(getComposite(to_native(12), to_native(0x0300)), undef);
98ok(getComposite(to_native(0x0055), 0xFF00), undef);
99ok(getComposite(to_native(0x0041), 0x0300), to_native(0x00C0));
100ok(getComposite(to_native(0x0055), 0x0300), to_native(0x00D9));
fe067ad9
SP
101ok(getComposite(0x0112, 0x0300), 0x1E14);
102ok(getComposite(0x1100, 0x1161), 0xAC00);
103ok(getComposite(0x1100, 0x1173), 0xADF8);
104ok(getComposite(0x1100, 0x11AF), undef);
105ok(getComposite(0x1173, 0x11AF), undef);
106ok(getComposite(0xAC00, 0x11A7), undef);
107ok(getComposite(0xAC00, 0x11A8), 0xAC01);
108ok(getComposite(0xADF8, 0x11AF), 0xAE00);
109
e0a65de5
CBW
110# 53
111
fe067ad9
SP
112sub uprops {
113 my $uv = shift;
114 my $r = "";
115 $r .= isExclusion($uv) ? 'X' : 'x';
116 $r .= isSingleton($uv) ? 'S' : 's';
117 $r .= isNonStDecomp($uv) ? 'N' : 'n'; # Non-Starter Decomposition
118 $r .= isComp_Ex($uv) ? 'F' : 'f'; # Full exclusion (X + S + N)
119 $r .= isComp2nd($uv) ? 'B' : 'b'; # B = M = Y
120 $r .= isNFD_NO($uv) ? 'D' : 'd';
121 $r .= isNFC_MAYBE($uv) ? 'M' : 'm'; # Maybe
122 $r .= isNFC_NO($uv) ? 'C' : 'c';
123 $r .= isNFKD_NO($uv) ? 'K' : 'k';
124 $r .= isNFKC_MAYBE($uv) ? 'Y' : 'y'; # maYbe
125 $r .= isNFKC_NO($uv) ? 'G' : 'g';
126 return $r;
127}
128
3ece07bc
SH
129ok(uprops(to_native(0x0000)), 'xsnfbdmckyg'); # NULL
130ok(uprops(to_native(0x0029)), 'xsnfbdmckyg'); # RIGHT PARENTHESIS
131ok(uprops(to_native(0x0041)), 'xsnfbdmckyg'); # LATIN CAPITAL LETTER A
132ok(uprops(to_native(0x00A0)), 'xsnfbdmcKyG'); # NO-BREAK SPACE
133ok(uprops(to_native(0x00C0)), 'xsnfbDmcKyg'); # LATIN CAPITAL LETTER A WITH GRAVE
fe067ad9
SP
134ok(uprops(0x0300), 'xsnfBdMckYg'); # COMBINING GRAVE ACCENT
135ok(uprops(0x0344), 'xsNFbDmCKyG'); # COMBINING GREEK DIALYTIKA TONOS
136ok(uprops(0x0387), 'xSnFbDmCKyG'); # GREEK ANO TELEIA
137ok(uprops(0x0958), 'XsnFbDmCKyG'); # DEVANAGARI LETTER QA
138ok(uprops(0x0F43), 'XsnFbDmCKyG'); # TIBETAN LETTER GHA
139ok(uprops(0x1100), 'xsnfbdmckyg'); # HANGUL CHOSEONG KIYEOK
140ok(uprops(0x1161), 'xsnfBdMckYg'); # HANGUL JUNGSEONG A
39f4556f 141ok(uprops(0x11AF), 'xsnfBdMckYg'); # HANGUL JONGSEONG RIEUL
fe067ad9
SP
142ok(uprops(0x212B), 'xSnFbDmCKyG'); # ANGSTROM SIGN
143ok(uprops(0xAC00), 'xsnfbDmcKyg'); # HANGUL SYLLABLE GA
144ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900
145ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE
146ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A
147
e0a65de5
CBW
148# 71
149
fe067ad9
SP
150ok(decompose(""), "");
151ok(decompose("A"), "A");
152ok(decompose("", 1), "");
153ok(decompose("A", 1), "A");
154
155ok(decompose(hexU("1E14 AC01")), hexU("0045 0304 0300 1100 1161 11A8"));
156ok(decompose(hexU("AC00 AE00")), hexU("1100 1161 1100 1173 11AF"));
157ok(decompose(hexU("304C FF76")), hexU("304B 3099 FF76"));
158
159ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8"));
160ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF"));
161ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB"));
162
163# don't modify the source
164my $sDec = "\x{FA19}";
165ok(decompose($sDec), "\x{795E}");
166ok($sDec, "\x{FA19}");
167
b7657688
A
168# 83
169
fe067ad9
SP
170ok(reorder(""), "");
171ok(reorder("A"), "A");
172ok(reorder(hexU("0041 0300 0315 0313 031b 0061")),
173 hexU("0041 031b 0300 0313 0315 0061"));
174ok(reorder(hexU("00C1 0300 0315 0313 031b 0061 309A 3099")),
175 hexU("00C1 031b 0300 0313 0315 0061 309A 3099"));
176
177# don't modify the source
178my $sReord = "\x{3000}\x{300}\x{31b}";
179ok(reorder($sReord), "\x{3000}\x{31b}\x{300}");
180ok($sReord, "\x{3000}\x{300}\x{31b}");
181
b7657688
A
182# 89
183
fe067ad9
SP
184ok(compose(""), "");
185ok(compose("A"), "A");
186ok(compose(hexU("0061 0300")), hexU("00E0"));
187ok(compose(hexU("0061 0300 031B")), hexU("00E0 031B"));
188ok(compose(hexU("0061 0300 0315")), hexU("00E0 0315"));
189ok(compose(hexU("0061 0300 0313")), hexU("00E0 0313"));
190ok(compose(hexU("0061 031B 0300")), hexU("00E0 031B"));
191ok(compose(hexU("0061 0315 0300")), hexU("0061 0315 0300"));
192ok(compose(hexU("0061 0313 0300")), hexU("0061 0313 0300"));
193
194# don't modify the source
195my $sCom = "\x{304B}\x{3099}";
196ok(compose($sCom), "\x{304C}");
197ok($sCom, "\x{304B}\x{3099}");
198
e0a65de5
CBW
199# 100
200
fe067ad9
SP
201ok(composeContiguous(""), "");
202ok(composeContiguous("A"), "A");
203ok(composeContiguous(hexU("0061 0300")), hexU("00E0"));
204ok(composeContiguous(hexU("0061 0300 031B")), hexU("00E0 031B"));
205ok(composeContiguous(hexU("0061 0300 0315")), hexU("00E0 0315"));
206ok(composeContiguous(hexU("0061 0300 0313")), hexU("00E0 0313"));
207ok(composeContiguous(hexU("0061 031B 0300")), hexU("0061 031B 0300"));
208ok(composeContiguous(hexU("0061 0315 0300")), hexU("0061 0315 0300"));
209ok(composeContiguous(hexU("0061 0313 0300")), hexU("0061 0313 0300"));
210
211# don't modify the source
212my $sCtg = "\x{30DB}\x{309A}";
213ok(composeContiguous($sCtg), "\x{30DD}");
214ok($sCtg, "\x{30DB}\x{309A}");
8f118dcd 215
e0a65de5
CBW
216# 111
217
8f118dcd
JH
218sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
219
fe067ad9
SP
220ok(answer(checkNFD("")), "YES");
221ok(answer(checkNFC("")), "YES");
222ok(answer(checkNFKD("")), "YES");
223ok(answer(checkNFKC("")), "YES");
224ok(answer(check("NFD", "")), "YES");
225ok(answer(check("NFC", "")), "YES");
226ok(answer(check("NFKD","")), "YES");
227ok(answer(check("NFKC","")), "YES");
228
8f118dcd 229# U+0000 to U+007F are prenormalized in all the normalization forms.
fe067ad9
SP
230ok(answer(checkNFD("AZaz\t12!#`")), "YES");
231ok(answer(checkNFC("AZaz\t12!#`")), "YES");
232ok(answer(checkNFKD("AZaz\t12!#`")), "YES");
233ok(answer(checkNFKC("AZaz\t12!#`")), "YES");
234ok(answer(check("D", "AZaz\t12!#`")), "YES");
235ok(answer(check("C", "AZaz\t12!#`")), "YES");
236ok(answer(check("KD","AZaz\t12!#`")), "YES");
237ok(answer(check("KC","AZaz\t12!#`")), "YES");
238
239ok(answer(checkNFD(NFD(_pack_U(0xC1, 0x1100, 0x1173, 0x11AF)))), "YES");
2b8d773d
RGS
240ok(answer(checkNFD(hexU("20 C1 1100 1173 11AF"))), "NO");
241ok(answer(checkNFC(hexU("20 C1 1173 11AF"))), "MAYBE");
242ok(answer(checkNFC(hexU("20 C1 AE00 1100"))), "YES");
243ok(answer(checkNFC(hexU("20 C1 AE00 1100 0300"))), "MAYBE");
244ok(answer(checkNFC(hexU("212B 1100 0300"))), "NO");
245ok(answer(checkNFC(hexU("1100 0300 212B"))), "NO");
246ok(answer(checkNFC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
247ok(answer(checkNFC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla
248ok(answer(checkNFC(hexU("20 C1 FF71 2025"))),"YES");
249ok(answer(check("NFC", hexU("20 C1 212B 300"))), "NO");
250ok(answer(checkNFKD(hexU("20 C1 FF71 2025"))), "NO");
251ok(answer(checkNFKC(hexU("20 C1 AE00 2025"))), "NO");
252ok(answer(checkNFKC(hexU("212B 1100 0300"))), "NO");
253ok(answer(checkNFKC(hexU("1100 0300 212B"))), "NO");
254ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
255ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO"); # A+ring+cedilla
256ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO");
f027f502 257
e0a65de5
CBW
258# 145
259
f027f502 260"012ABC" =~ /(\d+)(\w+)/;
fe067ad9 261ok("012" eq NFC $1 && "ABC" eq NFC $2);
f027f502 262
fe067ad9
SP
263ok(normalize('C', $1), "012");
264ok(normalize('C', $2), "ABC");
f027f502 265
fe067ad9
SP
266ok(normalize('NFC', $1), "012");
267ok(normalize('NFC', $2), "ABC");
f027f502
JH
268 # s/^NF// in normalize() must not prevent using $1, $&, etc.
269
b7657688
A
270# 150
271
fe067ad9
SP
272# a string with initial zero should be treated like a number
273
274# LATIN CAPITAL LETTER A WITH GRAVE
3ece07bc
SH
275ok(getCombinClass(sprintf("0%d", to_native(192))), 0);
276ok(getCanon (sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300));
277ok(getCompat(sprintf("0%d", to_native(192))), _pack_U(0x41, 0x300));
278my $lead_zero = sprintf "0%d", to_native(65);
279ok(getComposite($lead_zero, "0768"), to_native(192));
280ok(isNFD_NO (sprintf("0%d", to_native(192))));
281ok(isNFKD_NO(sprintf("0%d", to_native(192))));
fe067ad9
SP
282
283# DEVANAGARI LETTER QA
284ok(isExclusion("02392"));
285ok(isComp_Ex ("02392"));
286ok(isNFC_NO ("02392"));
287ok(isNFKC_NO ("02392"));
288ok(isNFD_NO ("02392"));
289ok(isNFKD_NO ("02392"));
290
291# ANGSTROM SIGN
292ok(isSingleton("08491"));
293ok(isComp_Ex ("08491"));
294ok(isNFC_NO ("08491"));
295ok(isNFKC_NO ("08491"));
296ok(isNFD_NO ("08491"));
297ok(isNFKD_NO ("08491"));
298
299# COMBINING GREEK DIALYTIKA TONOS
300ok(isNonStDecomp("0836"));
301ok(isComp_Ex ("0836"));
302ok(isNFC_NO ("0836"));
303ok(isNFKC_NO ("0836"));
304ok(isNFD_NO ("0836"));
305ok(isNFKD_NO ("0836"));
306
307# COMBINING GRAVE ACCENT
308ok(getCombinClass("0768"), 230);
309ok(isComp2nd ("0768"));
310ok(isNFC_MAYBE ("0768"));
311ok(isNFKC_MAYBE("0768"));
312
313# HANGUL SYLLABLE GA
314ok(getCombinClass("044032"), 0);
315ok(getCanon("044032"), _pack_U(0x1100, 0x1161));
316ok(getCompat("044032"), _pack_U(0x1100, 0x1161));
317ok(getComposite("04352", "04449"), 0xAC00);
318
e0a65de5
CBW
319# 182
320
fe067ad9
SP
321# string with 22 combining characters: (0x300..0x315)
322my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042);
323ok(decompose($str_cc22), $str_cc22);
324ok(reorder($str_cc22), $str_cc22);
325ok(compose($str_cc22), $str_cc22);
326ok(composeContiguous($str_cc22), $str_cc22);
327ok(NFD($str_cc22), $str_cc22);
328ok(NFC($str_cc22), $str_cc22);
329ok(NFKD($str_cc22), $str_cc22);
330ok(NFKC($str_cc22), $str_cc22);
331ok(FCD($str_cc22), $str_cc22);
332ok(FCC($str_cc22), $str_cc22);
333
b7657688
A
334# 192
335
fe067ad9
SP
336# string with 40 combining characters of the same class: (0x300..0x313)x2
337my $str_cc40 = _pack_U(0x3041, 0x300..0x313, 0x300..0x313, 0x3042);
338ok(decompose($str_cc40), $str_cc40);
339ok(reorder($str_cc40), $str_cc40);
340ok(compose($str_cc40), $str_cc40);
341ok(composeContiguous($str_cc40), $str_cc40);
342ok(NFD($str_cc40), $str_cc40);
343ok(NFC($str_cc40), $str_cc40);
344ok(NFKD($str_cc40), $str_cc40);
345ok(NFKC($str_cc40), $str_cc40);
346ok(FCD($str_cc40), $str_cc40);
347ok(FCC($str_cc40), $str_cc40);
348
e0a65de5
CBW
349# 202
350
fe067ad9
SP
351my $precomp = hexU("304C 304E 3050 3052 3054");
352my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
353ok(decompose($precomp x 5), $combseq x 5);
354ok(decompose($precomp x 10), $combseq x 10);
355ok(decompose($precomp x 20), $combseq x 20);
356
357my $hangsyl = hexU("AC00 B098 B2E4 B77C B9C8");
358my $jamoseq = hexU("1100 1161 1102 1161 1103 1161 1105 1161 1106 1161");
359ok(decompose($hangsyl x 5), $jamoseq x 5);
360ok(decompose($hangsyl x 10), $jamoseq x 10);
361ok(decompose($hangsyl x 20), $jamoseq x 20);
362
363my $notcomp = hexU("304B 304D 304F 3051 3053");
364ok(decompose($precomp . $notcomp), $combseq . $notcomp);
365ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5);
366ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10);
367
e0a65de5
CBW
368# 211
369
370my $preUnicode3_1 = !defined getCanon(0x1D15E);
371my $preUnicode3_2 = !defined getCanon(0x2ADC);
372
373# HEBREW LETTER YOD WITH HIRIQ
374ok($preUnicode3_1 xor isExclusion(0xFB1D));
375ok($preUnicode3_1 xor isComp_Ex (0xFB1D));
376
377# MUSICAL SYMBOL HALF NOTE
378ok($preUnicode3_1 xor isExclusion(0x1D15E));
379ok($preUnicode3_1 xor isComp_Ex (0x1D15E));
380
381# FORKING
382ok($preUnicode3_2 xor isExclusion(0x2ADC));
383ok($preUnicode3_2 xor isComp_Ex (0x2ADC));
fe067ad9 384
e0a65de5 385# 217
b7657688 386