d9f7db9b106eec8d59c04ff16d4adc525b39a883
[perl.git] / ext / Unicode-Collate / t / hangul.t
1 BEGIN {
2     unless ("A" eq pack('U', 0x41)) {
3         print "1..0 # Unicode::Collate " .
4             "cannot stringify a Unicode code point\n";
5         exit 0;
6     }
7     if ($ENV{PERL_CORE}) {
8         chdir('t') if -d 't';
9         @INC = $^O eq 'MacOS' ? qw(::lib) : qw(../lib);
10     }
11 }
12
13 use Test;
14 BEGIN { plan tests => 72 };
15
16 use strict;
17 use warnings;
18 use Unicode::Collate;
19
20 #########################
21
22 ok(1);
23
24 # a standard collator (3.1.1)
25 my $Collator = Unicode::Collate->new(
26   table => 'keys.txt',
27   normalization => undef,
28 );
29
30
31 # a collator for hangul sorting,
32 # cf. http://std.dkuug.dk/JTC1/SC22/WG20/docs/documents.html
33 #     http://std.dkuug.dk/JTC1/SC22/WG20/docs/n1051-hangulsort.pdf
34 my $hangul = Unicode::Collate->new(
35   level => 3,
36   table => undef,
37   normalization => undef,
38
39   entry => <<'ENTRIES',
40 0061      ; [.0A15.0020.0002] # LATIN SMALL LETTER A
41 0041      ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A
42 #1161     ; [.1800.0020.0002] # <comment> initial jungseong A
43 #1163     ; [.1801.0020.0002] # <comment> initial jungseong YA
44 1100      ; [.1831.0020.0002] # choseong KIYEOK
45 1100 1161 ; [.1831.0020.0002][.1800.0020.0002] # G-A
46 1100 1163 ; [.1831.0020.0002][.1801.0020.0002] # G-YA
47 1101      ; [.1831.0020.0002][.1831.0020.0002] # choseong SSANGKIYEOK
48 1101 1161 ; [.1831.0020.0002][.1831.0020.0002][.1800.0020.0002] # GG-A
49 1101 1163 ; [.1831.0020.0002][.1831.0020.0002][.1801.0020.0002] # GG-YA
50 1102      ; [.1833.0020.0002] # choseong NIEUN
51 1102 1161 ; [.1833.0020.0002][.1800.0020.0002] # N-A
52 1102 1163 ; [.1833.0020.0002][.1801.0020.0002] # N-YA
53 3042      ; [.1921.0020.000E] # HIRAGANA LETTER A
54 11A8      ; [.FE10.0020.0002] # jongseong KIYEOK
55 11A9      ; [.FE10.0020.0002][.FE10.0020.0002] # jongseong SSANGKIYEOK
56 1161      ; [.FE20.0020.0002] # jungseong A <non-initial>
57 1163      ; [.FE21.0020.0002] # jungseong YA <non-initial>
58 ENTRIES
59 );
60
61 ok(ref $hangul, "Unicode::Collate");
62
63 my $trailwt = Unicode::Collate->new(
64   level => 3,
65   table => undef,
66   normalization => undef,
67   hangul_terminator => 16,
68
69   entry => <<'ENTRIES', # Term < Jongseong < Jungseong < Choseong
70 0061  ; [.0A15.0020.0002] # LATIN SMALL LETTER A
71 0041  ; [.0A15.0020.0008] # LATIN CAPITAL LETTER A
72 11A8  ; [.1801.0020.0002] # HANGUL JONGSEONG KIYEOK
73 11A9  ; [.1801.0020.0002][.1801.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK
74 1161  ; [.1831.0020.0002] # HANGUL JUNGSEONG A
75 1163  ; [.1832.0020.0002] # HANGUL JUNGSEONG YA
76 1100  ; [.1861.0020.0002] # HANGUL CHOSEONG KIYEOK
77 1101  ; [.1861.0020.0002][.1861.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK
78 1102  ; [.1862.0020.0002] # HANGUL CHOSEONG NIEUN
79 3042  ; [.1921.0020.000E] # HIRAGANA LETTER A
80 ENTRIES
81 );
82
83 #########################
84
85 # L(simp)L(simp) vs L(comp): /GGA/
86 ok($Collator->lt("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
87 ok($hangul  ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
88 ok($trailwt ->eq("\x{1100}\x{1100}\x{1161}", "\x{1101}\x{1161}"));
89
90 # L(simp) vs L(simp)L(simp): /GA/ vs /GGA/
91 ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
92 ok($hangul  ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
93 ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1100}\x{1161}"));
94
95 # T(simp)T(simp) vs T(comp): /AGG/
96 ok($Collator->lt("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
97 ok($hangul  ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
98 ok($trailwt ->eq("\x{1161}\x{11A8}\x{11A8}", "\x{1161}\x{11A9}"));
99
100 # T(simp) vs T(simp)T(simp): /AG/ vs /AGG/
101 ok($Collator->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
102 ok($hangul  ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
103 ok($trailwt ->lt("\x{1161}\x{11A8}", "\x{1161}\x{11A8}\x{11A8}"));
104
105 # LV vs LLV: /GA/ vs /GNA/
106 ok($Collator->gt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
107 ok($hangul  ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
108 ok($trailwt ->lt("\x{1100}\x{1161}", "\x{1100}\x{1102}\x{1161}"));
109
110 # LVX vs LVV: /GAA/ vs /GA/.latinA
111 ok($Collator->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
112 ok($hangul  ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
113 ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}A"));
114
115 # LVX vs LVV: /GAA/ vs /GA/.hiraganaA
116 ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
117 ok($hangul  ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
118 ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{3042}"));
119
120 # LVX vs LVV: /GAA/ vs /GA/.hanja
121 ok($Collator->lt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
122 ok($hangul  ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
123 ok($trailwt ->gt("\x{1100}\x{1161}\x{1161}", "\x{1100}\x{1161}\x{4E00}"));
124
125 # LVL vs LVT: /GA/./G/ vs /GAG/
126 ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
127 ok($hangul  ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
128 ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{11A8}"));
129
130 # LVT vs LVX: /GAG/ vs /GA/.latinA
131 ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
132 ok($hangul  ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
133 ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}A"));
134
135 # LVT vs LVX: /GAG/ vs /GA/.hiraganaA
136 ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
137 ok($hangul  ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
138 ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{3042}"));
139
140 # LVT vs LVX: /GAG/ vs /GA/.hanja
141 ok($Collator->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
142 ok($hangul  ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
143 ok($trailwt ->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{4E00}"));
144
145 # LVT vs LVV: /GAG/ vs /GAA/
146 ok($Collator->gt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
147 ok($hangul  ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
148 ok($trailwt ->lt("\x{1100}\x{1161}\x{11A8}", "\x{1100}\x{1161}\x{1161}"));
149
150 # LVL vs LVV: /GA/./G/ vs /GAA/
151 ok($Collator->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
152 ok($hangul  ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
153 ok($trailwt ->lt("\x{1100}\x{1161}\x{1100}", "\x{1100}\x{1161}\x{1161}"));
154
155 # LV vs Syl(LV): /GA/ vs /[GA]/
156 ok($Collator->eq("\x{1100}\x{1161}", "\x{AC00}"));
157 ok($hangul  ->eq("\x{1100}\x{1161}", "\x{AC00}"));
158 ok($trailwt ->eq("\x{1100}\x{1161}", "\x{AC00}"));
159
160 # LVT vs Syl(LV)T: /GAG/ vs /[GA]G/
161 ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
162 ok($hangul  ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
163 ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC00}\x{11A8}"));
164
165 # LVT vs Syl(LVT): /GAG/ vs /[GAG]/
166 ok($Collator->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
167 ok($hangul  ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
168 ok($trailwt ->eq("\x{1100}\x{1161}\x{11A8}", "\x{AC01}"));
169
170 # LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/
171 ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
172 ok($hangul  ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
173 ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
174
175 # LVTT vs Syl(LVT).T: /GAGG/ vs /[GAG]G/
176 ok($Collator->gt("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
177 ok($hangul  ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
178 ok($trailwt ->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC01}\x{11A8}"));
179
180 # LLVT vs L.Syl(LVT): /GGAG/ vs /G[GAG]/
181 ok($Collator->gt("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
182 ok($hangul  ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
183 ok($trailwt ->eq("\x{1101}\x{1161}\x{11A8}", "\x{1100}\x{AC01}"));
184
185 #########################
186
187 # checks contraction in LVT:
188 # weights of these contractions may be non-sense.
189
190 my $hangcont = Unicode::Collate->new(
191   level => 3,
192   table => undef,
193   normalization => undef,
194   entry => <<'ENTRIES',
195 1100  ; [.1831.0020.0002] # HANGUL CHOSEONG KIYEOK
196 1101  ; [.1832.0020.0002] # HANGUL CHOSEONG SSANGKIYEOK
197 1161  ; [.188D.0020.0002] # HANGUL JUNGSEONG A
198 1162  ; [.188E.0020.0002] # HANGUL JUNGSEONG AE
199 1163  ; [.188F.0020.0002] # HANGUL JUNGSEONG YA
200 11A8  ; [.18CF.0020.0002] # HANGUL JONGSEONG KIYEOK
201 11A9  ; [.18D0.0020.0002] # HANGUL JONGSEONG SSANGKIYEOK
202 1161 11A9 ; [.0000.0000.0000] # A-GG <contraction>
203 1100 1163 11A8 ; [.1000.0020.0002] # G-YA-G <contraction> eq. U+AC39
204 ENTRIES
205 );
206
207 # contracted into VT
208 ok($Collator->lt("\x{1101}", "\x{1101}\x{1161}\x{11A9}"));
209 ok($hangcont->eq("\x{1101}", "\x{1101}\x{1161}\x{11A9}"));
210
211 # not contracted into LVT but into VT
212 ok($Collator->lt("\x{1100}", "\x{1100}\x{1161}\x{11A9}"));
213 ok($hangcont->eq("\x{1100}", "\x{1100}\x{1161}\x{11A9}"));
214
215 # contracted into LVT
216 ok($Collator->gt("\x{1100}\x{1163}\x{11A8}", "\x{1100}"));
217 ok($hangcont->lt("\x{1100}\x{1163}\x{11A8}", "\x{1100}"));
218
219 # LVTT vs Syl(LVTT): /GAGG/ vs /[GAGG]/
220 ok($Collator->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
221 ok($hangcont->eq("\x{1100}\x{1161}\x{11A9}", "\x{AC02}"));
222
223 # LVT vs Syl(LVT): /GYAG/ vs /[GYAG]/
224 ok($Collator->eq("\x{1100}\x{1163}\x{11A8}", "\x{AC39}"));
225 ok($hangcont->eq("\x{1100}\x{1163}\x{11A8}", "\x{AC39}"));
226
227 1;
228 __END__