Commit | Line | Data |
---|---|---|
ac5ea531 JH |
1 | package Unicode::Normalize; |
2 | ||
BEGIN {
    # Compile-time sanity check: verify that pack/unpack with the 'U'
    # template convert between the code point 0x41 and the character 'A'
    # in both directions.  Compilation is aborted otherwise.
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
        if pack('U', 0x41) ne 'A';
    die "Unicode::Normalize cannot get Unicode code point\n"
        if unpack('U', 'A') != 0x41;
}
11 | ||
ac5ea531 JH |
12 | use 5.006; |
13 | use strict; | |
14 | use warnings; | |
15 | use Carp; | |
16 | ||
e524f5b2 NC |
17 | no warnings 'utf8'; |
18 | ||
our $VERSION = '1.18';
our $PACKAGE = __PACKAGE__;

# The four normalization forms are exported by default.
our @EXPORT = qw/NFC NFD NFKC NFKD/;

# Everything else is exported only on request.
our @EXPORT_OK = (
    qw/normalize decompose reorder compose/,
    qw/checkNFD checkNFKD checkNFC checkNFKC check/,
    qw/getCanon getCompat getComposite getCombinClass/,
    qw/isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex/,
    qw/isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE/,
    qw/FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter/,
    qw/normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial/,
);

# Export tags (":all", ":normalize", ":check", ":fast").
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
    fast      => [ qw(FCD checkFCD FCC checkFCC composeContiguous) ],
);
ac5ea531 | 38 | |
fe067ad9 | 39 | ## |
788ba0f4 | 40 | ## utilities for tests |
fe067ad9 SP |
41 | ## |
42 | ||
# Builds a string from a list of code points using the 'U*' pack template.
sub pack_U {
    my @code_points = @_;
    return pack 'U*', @code_points;
}
46 | ||
# Splits a string into its list of code points using the 'U*' template.
# The argument is first concatenated with an empty string produced by
# pack('U*'), exactly as in the original one-liner.
sub unpack_U {
    my $string = shift(@_) . pack('U*');
    return unpack 'U*', $string;
}
50 | ||
a96160d6 CBW |
51 | require Exporter; |
52 | ||
95f3e8d2 SH |
53 | our @ISA = qw(Exporter); |
54 | use File::Spec; | |
55 | ||
# Lookup tables, populated further below:
#   %Combin  : $codepoint => $number       combination class
#   %Canon   : $codepoint => \@codepoints  canonical decomp.
#   %Compat  : $codepoint => \@codepoints  compat. decomp.
#   %Compos  : $1st,$2nd  => $codepoint    composite
#   %Exclus  : $codepoint => 1             composition exclusions
#   %Single  : $codepoint => 1             singletons
#   %NonStD  : $codepoint => 1             non-starter decompositions
#   %Comp2nd : $codepoint => 1             may be composed with a prev char.
our (%Combin, %Canon, %Compat, %Compos, %Exclus, %Single, %NonStD, %Comp2nd);

# Table text from the core Unicode database.  The "unicode/" path is tried
# only when the "unicore/" path did not yield a true value.
our $Combin = do "unicore/CombiningClass.pl";
$Combin ||= do "unicode/CombiningClass.pl";
croak "$PACKAGE: CombiningClass.pl not found" unless $Combin;

our $Decomp = do "unicore/Decomposition.pl";
$Decomp ||= do "unicode/Decomposition.pl";
croak "$PACKAGE: Decomposition.pl not found" unless $Decomp;
72 | ||
# CompositionExclusions.txt since Unicode 3.2.0
# (hexadecimal code points, kept in their original order)
our @CompEx = (
    # U+0958 .. U+0FB9
    qw/0958 0959 095A 095B 095C 095D 095E 095F/,
    qw/09DC 09DD 09DF/,
    qw/0A33 0A36 0A59 0A5A 0A5B 0A5E/,
    qw/0B5C 0B5D/,
    qw/0F43 0F4D 0F52 0F57 0F5C 0F69 0F76 0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9/,
    # U+FB1D .. U+FB4E
    qw/FB1D FB1F FB2A FB2B FB2C FB2D FB2E FB2F FB30 FB31 FB32 FB33 FB34/,
    qw/FB35 FB36 FB38 FB39 FB3A FB3B FB3C FB3E FB40 FB41 FB43 FB44 FB46/,
    qw/FB47 FB48 FB49 FB4A FB4B FB4C FB4D FB4E/,
    # U+2ADC
    qw/2ADC/,
    # U+1D15E .. U+1D1C0
    qw/1D15E 1D15F 1D160 1D161 1D162 1D163 1D164/,
    qw/1D1BB 1D1BC 1D1BD 1D1BE 1D1BF 1D1C0/,
);
83 | ||
# definition of Hangul constants
# Bases and counts are given literally; the remaining values are derived
# from them and evaluate to the numbers shown in the trailing comments.
use constant SBase  => 0xAC00;
use constant LBase  => 0x1100;
use constant VBase  => 0x1161;
use constant TBase  => 0x11A7;
use constant LCount => 19;
use constant VCount => 21;
use constant TCount => 28;
use constant NCount => VCount * TCount;       # 588
use constant SCount => LCount * NCount;       # 11172
use constant LFinal => LBase + LCount - 1;    # 0x1112
use constant VFinal => VBase + VCount - 1;    # 0x1175
use constant TFinal => TBase + TCount - 1;    # 0x11C2
use constant SFinal => SBase + SCount - 1;    # 0xD7A3
98 | ||
# Decomposes the Hangul syllable code point given as the first argument
# into its jamo code points (two of them, or three when the trailing index
# is non-zero).  Returns the list in list context, or the jamo packed into
# a string via pack_U() in scalar context.
sub decomposeHangul {
    my $offset = $_[0] - SBase;
    my $trail  = $offset % TCount;

    my @jamo = (
        LBase + int($offset / NCount),
        VBase + int(($offset % NCount) / TCount),
    );
    push @jamo, TBase + $trail if $trail;

    return wantarray ? @jamo : pack_U(@jamo);
}
111 | ||
112 | ########## getting full decomposition ########## | |
113 | ||
## Converts a string such as "hhhh hhhh hhhh" (hex digit groups separated
## by spaces) into the corresponding list of numbers.  Scanning stops at
## the first position that is not optional spaces followed by hex digits.
sub _getHexArray {
    my @hex_words = $_[0] =~ /\G *([0-9A-Fa-f]+)/g;
    return map { hex($_) } @hex_words;
}
117 | ||
# Fill %Combin from the loaded table text.  Each non-empty line holds
# tab-separated fields: first code point (hex), last code point (hex, or
# empty for a single code point) and the class value to store.
foreach my $row ($Combin =~ /(.+)/g) {
    my ($first, $last, $class) = split /\t/, $row;
    my $from = hex $first;
    my $to   = $last eq '' ? $from : hex $last;
    $Combin{$_} = $class for $from .. $to;
}
127 | ||
# Fill %Compat and %Canon from the loaded decomposition table text.
# Fields per line (tab-separated): first code point, last code point
# (empty for a single code point), and the mapping.  A mapping carrying a
# "<...>" tag is recorded in %Compat only; untagged ones go into both.
foreach my $row ($Decomp =~ /(.+)/g) {
    my ($first, $last, $mapping) = split /\t/, $row;

    # Strip the tag (if any) and remember whether there was one.
    my $is_compat = ($mapping =~ s/<[^>]+>//);
    my $dec       = [ _getHexArray($mapping) ];    # decomposition

    my $from = hex($first);                        # initial decomposable character
    my $to   = $last eq '' ? $from : hex($last);

    # ($from .. $to) is the range of decomposable characters.
    for my $cp ($from .. $to) {
        $Compat{$cp} = $dec;
        $Canon{$cp}  = $dec unless $is_compat;
    }
}
141 | ||
# Mark composition exclusions, skipping entries without a canonical
# decomposition in the loaded data.
foreach my $cp (map { hex($_) } @CompEx) {
    my $assigned = $Canon{$cp};                    # false: not assigned

    # 3.0.1 before Corrigendum #2
    my $pre_corrigendum = ($cp == 0xFB1D && !$Canon{0x1D15E});

    $Exclus{$cp} = 1 if $assigned && !$pre_corrigendum;
}
148 | ||
# Classify every canonical decomposition by its length:
#   1 element  -> singleton (%Single)
#   2 elements -> non-starter decomposition (%NonStD) when the first
#                 element has a true combining class; otherwise a
#                 composition pair (%Compos), whose second element is
#                 recorded in %Comp2nd unless the composite is excluded
#   otherwise  -> fatal error
foreach my $cp (keys %Canon) {
    my $dec = $Canon{$cp};
    my $len = @$dec;

    if ($len == 1) {
        $Single{$cp} = 1;
        next;
    }
    if ($len != 2) {
        my $h = sprintf '%04X', $cp;
        croak("Weird Canonical Decomposition of U+$h");
    }

    my ($first, $second) = @$dec;
    if ($Combin{$first}) {
        $NonStD{$cp} = 1;
        next;
    }
    $Compos{$first}{$second} = $cp;
    $Comp2nd{$second} = 1 unless $Exclus{$cp};
}
166 | ||
# modern HANGUL JUNGSEONG and HANGUL JONGSEONG jamo
$Comp2nd{$_} = 1 for 0x1161 .. 0x1175, 0x11A8 .. 0x11C2;
171 | ||
# Repeatedly applies one round of canonical decomposition (Hangul
# syllables via decomposeHangul(), other code points via %Canon) to the
# given list of code points until a round changes nothing, then returns
# the resulting list.
sub getCanonList {
    my @cur = @_;
    while (1) {
        my @next;
        for my $cp (@cur) {
            if (SBase <= $cp && $cp <= SFinal) {
                push @next, decomposeHangul($cp);
            }
            elsif ($Canon{$cp}) {
                push @next, @{ $Canon{$cp} };
            }
            else {
                push @next, $cp;
            }
        }
        # Compare the joined lists: comparing the element counts alone
        # is not sufficient.
        return @next if join(" ", @cur) eq join(" ", @next);
        @cur = @next;
    }
}
181 | ||
# Repeatedly applies one round of compatibility decomposition (Hangul
# syllables via decomposeHangul(), other code points via %Compat) to the
# given list of code points until a round changes nothing, then returns
# the resulting list.
sub getCompatList {
    my @cur = @_;
    while (1) {
        my @next;
        for my $cp (@cur) {
            if (SBase <= $cp && $cp <= SFinal) {
                push @next, decomposeHangul($cp);
            }
            elsif ($Compat{$cp}) {
                push @next, @{ $Compat{$cp} };
            }
            else {
                push @next, $cp;
            }
        }
        # Compare the joined lists: comparing the element counts alone
        # is not sufficient.
        return @next if join(" ", @cur) eq join(" ", @next);
        @cur = @next;
    }
}
191 | ||
# exhaustive decomposition: replace every stored mapping by the result of
# decomposing its key until nothing changes any more.
$Canon{$_}  = [ getCanonList($_)  ] for keys %Canon;
$Compat{$_} = [ getCompatList($_) ] for keys %Compat;
201 | ||
# Hangul composition of two code points.
# Returns the composed syllable code point for
#   - a first argument in LBase..LFinal followed by one in VBase..VFinal,
#   - a syllable in SBase..SFinal whose offset from SBase is a multiple of
#     TCount, followed by a code point in TBase+1..TFinal,
# and undef for any other pair.
sub getHangulComposite ($$) {
    my ($lead, $trail) = @_;

    if (LBase <= $lead && $lead <= LFinal
        && VBase <= $trail && $trail <= VFinal) {
        my $l_index = $lead  - LBase;
        my $v_index = $trail - VBase;
        return SBase + ($l_index * VCount + $v_index) * TCount;
    }

    if (SBase <= $lead && $lead <= SFinal
        && ($lead - SBase) % TCount == 0
        && TBase < $trail && $trail <= TFinal) {
        return $lead + $trail - TBase;
    }

    return undef;
}
215 | ||
216 | ########## | |
217 | ||
# Returns the value stored in %Combin for the (numified) code point, or 0
# when there is no true value stored.
sub getCombinClass ($) {
    my $cp    = 0 + $_[0];
    my $class = $Combin{$cp};
    return $class ? $class : 0;
}
222 | ||
# Returns the canonical decomposition of a code point as a string:
# the %Canon entry if one exists, else the Hangul decomposition for a
# code point in SBase..SFinal, else undef.
sub getCanon ($) {
    my $cp = 0 + $_[0];
    return pack_U(@{ $Canon{$cp} })    if exists $Canon{$cp};
    return scalar decomposeHangul($cp) if SBase <= $cp && $cp <= SFinal;
    return undef;
}
231 | ||
# Returns the compatibility decomposition of a code point as a string:
# the %Compat entry if one exists, else the Hangul decomposition for a
# code point in SBase..SFinal, else undef.
sub getCompat ($) {
    my $cp = 0 + $_[0];
    return pack_U(@{ $Compat{$cp} })   if exists $Compat{$cp};
    return scalar decomposeHangul($cp) if SBase <= $cp && $cp <= SFinal;
    return undef;
}
240 | ||
# Returns the composite of two code points: a true Hangul composite from
# getHangulComposite() takes precedence; otherwise the %Compos entry for
# the pair is returned (a false value when there is none).
sub getComposite ($$) {
    my $first  = 0 + $_[0];
    my $second = 0 + $_[1];

    my $hangul = getHangulComposite($first, $second);
    return $hangul if $hangul;

    # Look the row up first so that no entry is autovivified.
    my $row = $Compos{$first};
    return $row && $row->{$second};
}
248 | ||
# True if the (numified) code point has an entry in %Exclus.
sub isExclusion ($) {
    my $cp = 0 + $_[0];
    return exists $Exclus{$cp};
}
253 | ||
# True if the (numified) code point has an entry in %Single.
sub isSingleton ($) {
    my $cp = 0 + $_[0];
    return exists $Single{$cp};
}
258 | ||
# True if the (numified) code point has an entry in %NonStD.
sub isNonStDecomp($) {
    my $cp = 0 + $_[0];
    return exists $NonStD{$cp};
}
263 | ||
# True if the (numified) code point has an entry in %Comp2nd.
sub isComp2nd ($) {
    my $cp = 0 + $_[0];
    return exists $Comp2nd{$cp};
}
268 | ||
# NFC quick-check MAYBE: true if the (numified) code point has an entry
# in %Comp2nd.
sub isNFC_MAYBE ($) {
    my $cp = 0 + $_[0];
    return exists $Comp2nd{$cp};
}
273 | ||
# NFKC quick-check MAYBE: true if the (numified) code point has an entry
# in %Comp2nd.
sub isNFKC_MAYBE($) {
    my $cp = 0 + $_[0];
    return exists $Comp2nd{$cp};
}
278 | ||
# NFD quick-check NO: true if the code point has an entry in %Canon or
# lies in the Hangul syllable range SBase..SFinal.
sub isNFD_NO ($) {
    my $cp = 0 + $_[0];
    return exists $Canon{$cp}
        ? 1
        : (SBase <= $cp && $cp <= SFinal);
}
283 | ||
# NFKD quick-check NO: true if the code point has an entry in %Compat or
# lies in the Hangul syllable range SBase..SFinal.
sub isNFKD_NO ($) {
    my $cp = 0 + $_[0];
    return exists $Compat{$cp}
        ? 1
        : (SBase <= $cp && $cp <= SFinal);
}
288 | ||
# True if the code point has an entry in %Exclus, %Single or %NonStD.
sub isComp_Ex ($) {
    my $cp = 0 + $_[0];
    return 1 if exists $Exclus{$cp};
    return 1 if exists $Single{$cp};
    return exists $NonStD{$cp};
}
293 | ||
# NFC quick-check NO: true if the code point has an entry in %Exclus,
# %Single or %NonStD.
sub isNFC_NO ($) {
    my $cp = 0 + $_[0];
    return 1 if exists $Exclus{$cp};
    return 1 if exists $Single{$cp};
    return exists $NonStD{$cp};
}
298 | ||
# NFKC quick-check NO.
# Returns 1 for code points flagged in %Exclus, %Single or %NonStD;
# '' for Hangul syllables and code points without a %Compat entry;
# 1 when there is a %Compat entry but no %Canon entry; otherwise whether
# the two stored decompositions differ.
sub isNFKC_NO ($) {
    my $cp = 0 + $_[0];

    if ($Exclus{$cp} || $Single{$cp} || $NonStD{$cp}) {
        return 1;
    }
    if ((SBase <= $cp && $cp <= SFinal) || !exists $Compat{$cp}) {
        return '';
    }
    unless (exists $Canon{$cp}) {
        return 1;
    }

    my $canon  = pack 'N*', @{ $Canon{$cp} };
    my $compat = pack 'N*', @{ $Compat{$cp} };
    return $canon ne $compat;
}
306 | ||
##
## string decompose(string, compat?)
##
## Returns the string built from the decomposition of each character of
## the first argument.  %Compat is consulted when the second argument is
## true, %Canon otherwise; Hangul syllables without a table entry are
## decomposed via decomposeHangul(); other characters are kept as they are.
##
sub decompose ($;$)
{
    my $table = $_[1] ? \%Compat : \%Canon;

    my @out;
    for my $cp (unpack_U($_[0])) {
        if ($table->{$cp}) {
            push @out, @{ $table->{$cp} };
        }
        elsif (SBase <= $cp && $cp <= SFinal) {
            push @out, decomposeHangul($cp);
        }
        else {
            push @out, $cp;
        }
    }
    return pack_U(@out);
}
318 | ||
##
## string reorder(string)
##
## Sorts every maximal run of characters having a true %Combin value by
## that value, keeping the original relative order of equal values.
##
sub reorder ($)
{
    my @cps = unpack_U($_[0]);

    my $pos = 0;
    while ($pos < @cps) {
        # Characters without a true combining class are left in place.
        unless ($Combin{ $cps[$pos] }) {
            $pos++;
            next;
        }

        # Find the end of the current run of combining characters.
        my $run_start = $pos;
        $pos++ while $pos < @cps && $Combin{ $cps[$pos] };
        my $run_end = $pos - 1;

        # Sort the indexes of the run; ties are broken by position so
        # that the sort is stable.
        my @order = sort {
            $Combin{ $cps[$a] } <=> $Combin{ $cps[$b] } || $a <=> $b
        } $run_start .. $run_end;

        @cps[ $run_start .. $run_end ] = @cps[ @order ];
    }
    return pack_U(@cps);
}
340 | ||
341 | ||
##
## string compose(string)
##
## S : starter; NS : not starter;
##
## composable sequence begins at S.
## S + S or (S + S) + S may be composed.
## NS + NS must not be composed.
##
## Returns the string after composing, for every starter, the following
## characters that are not blocked from it.  No decomposition or
## reordering is applied.
##
sub compose ($)
{
    my @src = unpack_U($_[0]);

    for (my $s = 0; $s+1 < @src; $s++) {
        next unless defined $src[$s] && ! $Combin{ $src[$s] };
        # S only; removed or combining are skipped as a starter.

        my $blocked = 0;

        # Number of combining characters seen after the starter that have
        # NOT been composed into it.  This has to be a count rather than a
        # flag: with a flag, S + C1 (uncomposed) + C2 (composed) would
        # forget about C1, and a following starter could then be composed
        # with S although C1 still lies between them.
        my $uncomposed_cc = 0;

        for (my $j = $s+1; $j < @src && !$blocked; $j++) {
            if ($Combin{ $src[$j] }) {
                $uncomposed_cc++;
            } else {
                $blocked = 1;   # a starter ends the scan unless it composes
            }

            # S + C + S => S-S + C would be blocked.
            next if $blocked && $uncomposed_cc;

            # blocked by same CC (and higher CC: revised D2)
            next if defined $src[$j-1] && $Combin{ $src[$j-1] }
                && $Combin{ $src[$j-1] } >= $Combin{ $src[$j] };

            my $c = getComposite($src[$s], $src[$j]);

            # no composite or is exclusion
            next if !$c || $Exclus{$c};

            # replace by composite
            $src[$s] = $c;
            $src[$j] = undef;
            if ($blocked) {
                $blocked = 0;       # the starter was consumed; keep scanning
            } else {
                $uncomposed_cc--;   # this combining character is gone
            }
        }
    }
    return pack_U(grep defined, @src);
}
382 | ||
383 | ||
##
## string composeContiguous(string)
##
## For every starter, composes the immediately following characters one
## after another and stops at the first character that yields no
## composite (or an excluded one).
##
sub composeContiguous ($)
{
    my @cps = unpack_U($_[0]);

    foreach my $s (0 .. $#cps - 1) {
        # S only; removed or combining are skipped as a starter.
        next if !defined $cps[$s] || $Combin{ $cps[$s] };

        foreach my $j ($s + 1 .. $#cps) {
            my $composite = getComposite($cps[$s], $cps[$j]);

            # no composite or is exclusion
            last unless $composite && !$Exclus{$composite};

            # replace by composite
            @cps[$s, $j] = ($composite, undef);
        }
    }
    return pack_U(grep defined, @cps);
}
407 | ||
408 | ||
409 | ## | |
410 | ## normalization forms | |
411 | ## | |
412 | ||
# Second argument of decompose() selecting compatibility decomposition.
use constant COMPAT => 1;

# Each form is a pipeline of decompose(), reorder() and, for the composed
# forms, compose() or composeContiguous().
sub NFD ($) {
    my $decomposed = decompose($_[0]);
    return reorder($decomposed);
}

sub NFKD ($) {
    my $decomposed = decompose($_[0], COMPAT);
    return reorder($decomposed);
}

sub NFC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

sub NFKC ($) {
    my $decomposed = decompose($_[0], COMPAT);
    my $reordered  = reorder($decomposed);
    return compose($reordered);
}

sub FCC ($) {
    my $decomposed = decompose($_[0]);
    my $reordered  = reorder($decomposed);
    return composeContiguous($reordered);
}
420 | ||
421 | ## | |
422 | ## quick check | |
423 | ## | |
424 | ||
# Quick check for NFD: returns '' (NO) as soon as a character is out of
# combining-class order, has a %Canon entry, or is a Hangul syllable;
# returns 1 (YES) otherwise.
sub checkNFD ($)
{
    my $prev_cc = 0;
    foreach my $cp (unpack_U($_[0])) {
        my $cc = $Combin{$cp} || 0;

        my $out_of_order = $cc != 0 && $prev_cc > $cc;
        my $decomposable = exists $Canon{$cp} || (SBase <= $cp && $cp <= SFinal);
        return '' if $out_of_order || $decomposable;

        $prev_cc = $cc;
    }
    return 1;
}
437 | ||
# Quick check for NFKD: returns '' (NO) as soon as a character is out of
# combining-class order, has a %Compat entry, or is a Hangul syllable;
# returns 1 (YES) otherwise.
sub checkNFKD ($)
{
    my $prev_cc = 0;
    foreach my $cp (unpack_U($_[0])) {
        my $cc = $Combin{$cp} || 0;

        my $out_of_order = $cc != 0 && $prev_cc > $cc;
        my $decomposable = exists $Compat{$cp} || (SBase <= $cp && $cp <= SFinal);
        return '' if $out_of_order || $decomposable;

        $prev_cc = $cc;
    }
    return 1;
}
450 | ||
# Quick check for NFC: returns '' (NO) when a character is out of
# combining-class order or isNFC_NO() holds for it, undef (MAYBE) when no
# such character was found but isNFC_MAYBE() held for at least one, and
# 1 (YES) otherwise.  isNFC_MAYBE() is consulted before isNFC_NO().
sub checkNFC ($)
{
    my $prev_cc = 0;
    my $maybe;
    foreach my $cp (unpack_U($_[0])) {
        my $cc = $Combin{$cp} || 0;
        return '' if $cc != 0 && $prev_cc > $cc;

        if (isNFC_MAYBE($cp)) {
            $maybe = 1;
        }
        else {
            return '' if isNFC_NO($cp);
        }
        $prev_cc = $cc;
    }
    return undef if $maybe;
    return 1;
}
468 | ||
# Quick check for NFKC: returns '' (NO) when a character is out of
# combining-class order or isNFKC_NO() holds for it, undef (MAYBE) when no
# such character was found but isNFKC_MAYBE() held for at least one, and
# 1 (YES) otherwise.  isNFKC_MAYBE() is consulted before isNFKC_NO().
sub checkNFKC ($)
{
    my $prev_cc = 0;
    my $maybe;
    foreach my $cp (unpack_U($_[0])) {
        my $cc = $Combin{$cp} || 0;
        return '' if $cc != 0 && $prev_cc > $cc;

        if (isNFKC_MAYBE($cp)) {
            $maybe = 1;
        }
        else {
            return '' if isNFKC_NO($cp);
        }
        $prev_cc = $cc;
    }
    return undef if $maybe;
    return 1;
}
486 | ||
# Check for FCD: for each character, the combining class of the first
# element of its canonical decomposition is compared with the class of
# the last element of the previous character's decomposition.
# Returns '' (NO) on the first violation, 1 (YES) otherwise.
sub checkFCD ($)
{
    my $prev_cc = 0;
    foreach my $cp (unpack_U($_[0])) {
        # A Hangul syllable need not be decomposed since cc[any Jamo] == 0.
        my $dec = exists $Canon{$cp} ? $Canon{$cp} : [ $cp ];
        my ($lead, $trail) = @{$dec}[0, -1];

        my $lead_cc = $Combin{$lead} || 0;
        return '' if $lead_cc != 0 && $lead_cc < $prev_cc;

        $prev_cc = $Combin{$trail} || 0;
    }
    return 1;
}
501 | ||
# Check for FCC: the FCD ordering test on each character combined with
# the NFC quick-check properties.  Returns '' (NO) on an ordering
# violation or when isNFC_NO() holds, undef (MAYBE) when isNFC_MAYBE()
# held for some character, and 1 (YES) otherwise.
sub checkFCC ($)
{
    my $prev_cc = 0;
    my $maybe;
    foreach my $cp (unpack_U($_[0])) {
        # A Hangul syllable need not be decomposed since cc[any Jamo] == 0.
        my $dec = exists $Canon{$cp} ? $Canon{$cp} : [ $cp ];
        my ($lead, $trail) = @{$dec}[0, -1];

        my $lead_cc = $Combin{$lead} || 0;
        return '' if $lead_cc != 0 && $lead_cc < $prev_cc;

        if (isNFC_MAYBE($cp)) {
            $maybe = 1;
        }
        else {
            return '' if isNFC_NO($cp);
        }

        $prev_cc = $Combin{$trail} || 0;
    }
    return undef if $maybe;
    return 1;
}
523 | ||
524 | ## | |
525 | ## split on last starter | |
526 | ## | |
527 | ||
# Splits a string into (head, tail): characters are moved from the end of
# the string to the front of the tail until the character just moved has
# a combining class of zero or the string is exhausted.
# An empty string yields ('', '').
sub splitOnLastStarter
{
    my $head = pack_U(unpack_U(shift));
    return ('', '') if $head eq '';

    my $tail = "";
    while (1) {
        my $last_char = chop($head);
        $tail = $last_char . $tail;
        last unless getCombinClass(unpack 'U', $last_char) && $head ne "";
    }
    return ($head, $tail);
}
82e740b6 NC |
543 | |
544 | ## | |
a96160d6 | 545 | ## normalize |
82e740b6 NC |
546 | ## |
547 | ||
82e740b6 NC |
# Returns the string unchanged when checkFCD() accepts it, otherwise its
# NFD form.
sub FCD ($) {
    my $str = shift;
    return $str if checkFCD($str);
    return NFD($str);
}
82e740b6 NC |
552 | |
# Dispatch table: form name (long or short) => normalization function.
our %formNorm;
@formNorm{ qw/NFC  C / } = (\&NFC)  x 2;
@formNorm{ qw/NFD  D / } = (\&NFD)  x 2;
@formNorm{ qw/NFKC KC/ } = (\&NFKC) x 2;
@formNorm{ qw/NFKD KD/ } = (\&NFKD) x 2;
$formNorm{FCD} = \&FCD;
$formNorm{FCC} = \&FCC;
560 | ||
ac5ea531 JH |
# normalize($form_name, $string)
# Applies the normalization function registered in %formNorm under the
# given form name; croaks on an unknown form name.
sub normalize($$)
{
    my ($form, $str) = @_;
    croak($PACKAGE."::normalize: invalid form name: $form")
        unless exists $formNorm{$form};
    return $formNorm{$form}->($str);
}
570 | ||
a96160d6 CBW |
571 | ## |
572 | ## partial | |
573 | ## | |
574 | ||
# normalize_partial($form_name, $unprocessed)
# Normalizes the second argument, splits the result with
# splitOnLastStarter(), stores the tail back into the caller's variable
# (through the @_ alias) and returns the head.  Croaks on an unknown
# form name.
sub normalize_partial ($$) {
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]")
        unless exists $formNorm{$_[0]};

    my $normalized = normalize($_[0], $_[1]);
    my ($processed, $unprocessed) = splitOnLastStarter($normalized);

    # $_[1] aliases the caller's variable; this is the documented side effect.
    $_[1] = $unprocessed;
    return $processed;
}
584 | ||
# Shortcuts for normalize_partial() with a fixed form name.  $_[0] is
# passed on directly so that the caller's variable is still updated.
sub NFD_partial ($) {
    my $processed = normalize_partial('NFD', $_[0]);
    return $processed;
}

sub NFC_partial ($) {
    my $processed = normalize_partial('NFC', $_[0]);
    return $processed;
}

sub NFKD_partial($) {
    my $processed = normalize_partial('NFKD',$_[0]);
    return $processed;
}

sub NFKC_partial($) {
    my $processed = normalize_partial('NFKC',$_[0]);
    return $processed;
}
82e740b6 NC |
589 | |
590 | ## | |
a96160d6 | 591 | ## check |
82e740b6 NC |
592 | ## |
593 | ||
# Dispatch table: form name (long or short) => quick-check function.
our %formCheck;
@formCheck{ qw/NFC  C / } = (\&checkNFC)  x 2;
@formCheck{ qw/NFD  D / } = (\&checkNFD)  x 2;
@formCheck{ qw/NFKC KC/ } = (\&checkNFKC) x 2;
@formCheck{ qw/NFKD KD/ } = (\&checkNFKD) x 2;
$formCheck{FCD} = \&checkFCD;
$formCheck{FCC} = \&checkFCC;
601 | ||
8f118dcd JH |
# check($form_name, $string)
# Applies the quick-check function registered in %formCheck under the
# given form name; croaks on an unknown form name.
sub check($$)
{
    my ($form, $str) = @_;
    croak($PACKAGE."::check: invalid form name: $form")
        unless exists $formCheck{$form};
    return $formCheck{$form}->($str);
}
611 | ||
ac5ea531 JH |
612 | 1; |
613 | __END__ | |
2a204b45 JH |
614 | |
615 | =head1 NAME | |
616 | ||
f027f502 | 617 | Unicode::Normalize - Unicode Normalization Forms |
2a204b45 JH |
618 | |
619 | =head1 SYNOPSIS | |
620 | ||
a092bcfd RGS |
621 | (1) using function names exported by default: |
622 | ||
2a204b45 JH |
623 | use Unicode::Normalize; |
624 | ||
8f118dcd JH |
625 | $NFD_string = NFD($string); # Normalization Form D |
626 | $NFC_string = NFC($string); # Normalization Form C | |
627 | $NFKD_string = NFKD($string); # Normalization Form KD | |
628 | $NFKC_string = NFKC($string); # Normalization Form KC | |
2a204b45 | 629 | |
a092bcfd | 630 | (2) using function names exported on request: |
2a204b45 JH |
631 | |
632 | use Unicode::Normalize 'normalize'; | |
633 | ||
8f118dcd JH |
634 | $NFD_string = normalize('D', $string); # Normalization Form D |
635 | $NFC_string = normalize('C', $string); # Normalization Form C | |
636 | $NFKD_string = normalize('KD', $string); # Normalization Form KD | |
637 | $NFKC_string = normalize('KC', $string); # Normalization Form KC | |
2a204b45 JH |
638 | |
639 | =head1 DESCRIPTION | |
640 | ||
00f2676f JH |
641 | Parameters: |
642 | ||
fe067ad9 | 643 | C<$string> is used as a string under character semantics (see F<perlunicode>). |
00f2676f | 644 | |
fe067ad9 | 645 | C<$code_point> should be an unsigned integer representing a Unicode code point. |
00f2676f | 646 | |
Note: Do not pass a floating-point number or a negative value as C<$code_point>.
00f2676f | 648 | |
d85850a7 | 649 | =head2 Normalization Forms |
2a204b45 JH |
650 | |
651 | =over 4 | |
652 | ||
8f118dcd | 653 | =item C<$NFD_string = NFD($string)> |
2a204b45 | 654 | |
fe067ad9 | 655 | It returns the Normalization Form D (formed by canonical decomposition). |
2a204b45 | 656 | |
8f118dcd | 657 | =item C<$NFC_string = NFC($string)> |
2a204b45 | 658 | |
fe067ad9 | 659 | It returns the Normalization Form C (formed by canonical decomposition |
2a204b45 JH |
660 | followed by canonical composition). |
661 | ||
8f118dcd | 662 | =item C<$NFKD_string = NFKD($string)> |
2a204b45 | 663 | |
fe067ad9 | 664 | It returns the Normalization Form KD (formed by compatibility decomposition). |
2a204b45 | 665 | |
8f118dcd | 666 | =item C<$NFKC_string = NFKC($string)> |
2a204b45 | 667 | |
fe067ad9 | 668 | It returns the Normalization Form KC (formed by compatibility decomposition |
2a204b45 JH |
669 | followed by B<canonical> composition). |
670 | ||
82e740b6 NC |
671 | =item C<$FCD_string = FCD($string)> |
672 | ||
673 | If the given string is in FCD ("Fast C or D" form; cf. UTN #5), | |
fe067ad9 | 674 | it returns the string without modification; otherwise it returns an FCD string. |
82e740b6 NC |
675 | |
Note: FCD is not always unique, so several different forms may be
equivalent to each other. C<FCD()> will return one of these equivalent forms.
678 | ||
679 | =item C<$FCC_string = FCC($string)> | |
680 | ||
fe067ad9 | 681 | It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). |
82e740b6 | 682 | |
Note: FCC is unique, as are the four normalization forms (NF*).
82e740b6 | 684 | |
8f118dcd | 685 | =item C<$normalized_string = normalize($form_name, $string)> |
2a204b45 | 686 | |
fe067ad9 SP |
687 | It returns the normalization form of C<$form_name>. |
688 | ||
2a204b45 JH |
689 | As C<$form_name>, one of the following names must be given. |
690 | ||
82e740b6 NC |
691 | 'C' or 'NFC' for Normalization Form C (UAX #15) |
692 | 'D' or 'NFD' for Normalization Form D (UAX #15) | |
693 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) | |
694 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) | |
695 | ||
696 | 'FCD' for "Fast C or D" Form (UTN #5) | |
697 | 'FCC' for "Fast C Contiguous" (UTN #5) | |
2a204b45 JH |
698 | |
699 | =back | |
700 | ||
8f118dcd JH |
701 | =head2 Decomposition and Composition |
702 | ||
703 | =over 4 | |
704 | ||
fe067ad9 | 705 | =item C<$decomposed_string = decompose($string [, $useCompatMapping])> |
8f118dcd | 706 | |
fe067ad9 SP |
707 | It returns the concatenation of the decomposition of each character |
708 | in the string. | |
8f118dcd | 709 | |
fe067ad9 SP |
710 | If the second parameter (a boolean) is omitted or false, |
711 | the decomposition is canonical decomposition; | |
712 | if the second parameter (a boolean) is true, | |
713 | the decomposition is compatibility decomposition. | |
8f118dcd | 714 | |
fe067ad9 | 715 | The string returned is not always in NFD/NFKD. Reordering may be required. |
8f118dcd JH |
716 | |
717 | $NFD_string = reorder(decompose($string)); # eq. to NFD() | |
718 | $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() | |
719 | ||
fe067ad9 | 720 | =item C<$reordered_string = reorder($string)> |
8f118dcd | 721 | |
fe067ad9 SP |
722 | It returns the result of reordering the combining characters |
723 | according to Canonical Ordering Behavior. | |
8f118dcd | 724 | |
fe067ad9 SP |
725 | For example, when you have a list of NFD/NFKD strings, |
726 | you can get the concatenated NFD/NFKD string from them, by saying | |
8f118dcd JH |
727 | |
728 | $concat_NFD = reorder(join '', @NFD_strings); | |
729 | $concat_NFKD = reorder(join '', @NFKD_strings); | |
730 | ||
fe067ad9 | 731 | =item C<$composed_string = compose($string)> |
8f118dcd | 732 | |
fe067ad9 SP |
733 | It returns the result of canonical composition |
734 | without applying any decomposition. | |
8f118dcd | 735 | |
fe067ad9 SP |
736 | For example, when you have a NFD/NFKD string, |
737 | you can get its NFC/NFKC string, by saying | |
8f118dcd JH |
738 | |
739 | $NFC_string = compose($NFD_string); | |
740 | $NFKC_string = compose($NFKD_string); | |
741 | ||
a96160d6 CBW |
742 | =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> |
743 | ||
It returns two strings: the first one, C<$processed>, is the part
before the last starter, and the second one, C<$unprocessed>, is
the remaining part, beginning with the last starter. A starter is
a character having a combining class of zero (see UAX #15).
748 | ||
749 | Note that C<$processed> may be empty (when C<$normalized> contains no | |
750 | starter or starts with the last starter), and then C<$unprocessed> | |
751 | should be equal to the entire C<$normalized>. | |
752 | ||
753 | When you have a C<$normalized> string and an C<$unnormalized> string | |
754 | following it, a simple concatenation is wrong: | |
755 | ||
756 | $concat = $normalized . normalize($form, $unnormalized); # wrong! | |
757 | ||
Instead, do this:
759 | ||
760 | ($processed, $unprocessed) = splitOnLastStarter($normalized); | |
761 | $concat = $processed . normalize($form, $unprocessed.$unnormalized); | |
762 | ||
763 | C<splitOnLastStarter()> should be called with a pre-normalized parameter | |
764 | C<$normalized>, that is in the same form as C<$form> you want. | |
765 | ||
766 | If you have an array of C<@string> that should be concatenated and then | |
767 | normalized, you can do like this: | |
768 | ||
769 | my $result = ""; | |
770 | my $unproc = ""; | |
771 | foreach my $str (@string) { | |
772 | $unproc .= $str; | |
773 | my $n = normalize($form, $unproc); | |
774 | my($p, $u) = splitOnLastStarter($n); | |
775 | $result .= $p; | |
776 | $unproc = $u; | |
777 | } | |
778 | $result .= $unproc; | |
779 | # instead of normalize($form, join('', @string)) | |
780 | ||
781 | =item C<$processed = normalize_partial($form, $unprocessed)> | |
782 | ||
783 | A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. | |
784 | Note that C<$unprocessed> will be modified as a side-effect. | |
785 | ||
786 | If you have an array of C<@string> that should be concatenated and then | |
787 | normalized, you can do like this: | |
788 | ||
789 | my $result = ""; | |
790 | my $unproc = ""; | |
791 | foreach my $str (@string) { | |
792 | $unproc .= $str; | |
793 | $result .= normalize_partial($form, $unproc); | |
794 | } | |
795 | $result .= $unproc; | |
796 | # instead of normalize($form, join('', @string)) | |
797 | ||
798 | =item C<$processed = NFD_partial($unprocessed)> | |
799 | ||
It behaves like C<normalize_partial('NFD', $unprocessed)>.
801 | Note that C<$unprocessed> will be modified as a side-effect. | |
802 | ||
803 | =item C<$processed = NFC_partial($unprocessed)> | |
804 | ||
It behaves like C<normalize_partial('NFC', $unprocessed)>.
806 | Note that C<$unprocessed> will be modified as a side-effect. | |
807 | ||
808 | =item C<$processed = NFKD_partial($unprocessed)> | |
809 | ||
It behaves like C<normalize_partial('NFKD', $unprocessed)>.
811 | Note that C<$unprocessed> will be modified as a side-effect. | |
812 | ||
813 | =item C<$processed = NFKC_partial($unprocessed)> | |
814 | ||
It behaves like C<normalize_partial('NFKC', $unprocessed)>.
816 | Note that C<$unprocessed> will be modified as a side-effect. | |
817 | ||
8f118dcd JH |
818 | =back |
819 | ||
820 | =head2 Quick Check | |
821 | ||
82e740b6 | 822 | (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>) |
8f118dcd JH |
823 | |
824 | The following functions check whether the string is in that normalization form. | |
825 | ||
fe067ad9 | 826 | The result returned will be one of the following: |
8f118dcd JH |
827 | |
828 | YES The string is in that normalization form. | |
829 | NO The string is not in that normalization form. | |
830 | MAYBE Dubious. Maybe yes, maybe no. | |
831 | ||
832 | =over 4 | |
833 | ||
834 | =item C<$result = checkNFD($string)> | |
835 | ||
fe067ad9 | 836 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
8f118dcd JH |
837 | |
838 | =item C<$result = checkNFC($string)> | |
839 | ||
fe067ad9 | 840 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 841 | C<undef> if C<MAYBE>. |
8f118dcd JH |
842 | |
843 | =item C<$result = checkNFKD($string)> | |
844 | ||
fe067ad9 | 845 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
8f118dcd JH |
846 | |
847 | =item C<$result = checkNFKC($string)> | |
848 | ||
fe067ad9 | 849 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 850 | C<undef> if C<MAYBE>. |
8f118dcd | 851 | |
82e740b6 NC |
852 | =item C<$result = checkFCD($string)> |
853 | ||
fe067ad9 | 854 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
82e740b6 NC |
855 | |
856 | =item C<$result = checkFCC($string)> | |
857 | ||
fe067ad9 | 858 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 859 | C<undef> if C<MAYBE>. |
82e740b6 | 860 | |
fe067ad9 | 861 | Note: If a string is not in FCD, it must not be in FCC. |
82e740b6 NC |
862 | So C<checkFCC($not_FCD_string)> should return C<NO>. |
863 | ||
8f118dcd JH |
864 | =item C<$result = check($form_name, $string)> |
865 | ||
fe067ad9 | 866 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 867 | C<undef> if C<MAYBE>. |
8f118dcd | 868 | |
628bbff0 RGS |
869 | As C<$form_name>, one of the following names must be given. |
870 | ||
871 | 'C' or 'NFC' for Normalization Form C (UAX #15) | |
872 | 'D' or 'NFD' for Normalization Form D (UAX #15) | |
873 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) | |
874 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) | |
875 | ||
876 | 'FCD' for "Fast C or D" Form (UTN #5) | |
877 | 'FCC' for "Fast C Contiguous" (UTN #5) | |
8f118dcd JH |
878 | |
879 | =back | |
880 | ||
881 | B<Note> | |
882 | ||
82e740b6 NC |
883 | In the cases of NFD, NFKD, and FCD, the answer must be |
884 | either C<YES> or C<NO>. The answer C<MAYBE> may be returned | |
885 | in the cases of NFC, NFKC, and FCC. | |
8f118dcd | 886 | |
82e740b6 NC |
887 | A C<MAYBE> string should contain at least one combining character |
888 | or the like. For example, C<COMBINING ACUTE ACCENT> has | |
8f118dcd | 889 | the MAYBE_NFC/MAYBE_NFKC property. |
82e740b6 | 890 | |
8f118dcd JH |
891 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> |
892 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. | |
f027f502 | 893 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC |
8f118dcd JH |
894 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), |
895 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. | |
896 | ||
628bbff0 RGS |
897 | If you want to check exactly, compare the string with its NFC/NFKC/FCC. |
898 | ||
899 | if ($string eq NFC($string)) { | |
a96160d6 | 900 | # $string is exactly normalized in NFC; |
628bbff0 | 901 | } else { |
a96160d6 | 902 | # $string is not normalized in NFC; |
628bbff0 | 903 | } |
8f118dcd | 904 | |
628bbff0 | 905 | if ($string eq NFKC($string)) { |
a96160d6 | 906 | # $string is exactly normalized in NFKC; |
628bbff0 | 907 | } else { |
a96160d6 | 908 | # $string is not normalized in NFKC; |
628bbff0 | 909 | } |
8f118dcd | 910 | |
2a204b45 JH |
911 | =head2 Character Data |
912 | ||
913 | These functions are an interface to the character data used internally. |
d0ed0342 BG |
914 | If you only want to get Unicode normalization forms, you don't need |
915 | to call them yourself. |
2a204b45 JH |
916 | |
917 | =over 4 | |
918 | ||
fe067ad9 | 919 | =item C<$canonical_decomposition = getCanon($code_point)> |
2a204b45 | 920 | |
fe067ad9 SP |
921 | If the character is canonically decomposable (including Hangul Syllables), |
922 | it returns the (full) canonical decomposition as a string. | |
923 | Otherwise it returns C<undef>. | |
8f118dcd | 924 | |
fe067ad9 SP |
925 | B<Note:> According to the Unicode standard, the canonical decomposition |
926 | of a character that is not canonically decomposable is the same as |
927 | the character itself. | |
8f118dcd | 928 | |
fe067ad9 | 929 | =item C<$compatibility_decomposition = getCompat($code_point)> |
2a204b45 | 930 | |
fe067ad9 SP |
931 | If the character is compatibility decomposable (including Hangul Syllables), |
932 | it returns the (full) compatibility decomposition as a string. | |
933 | Otherwise it returns C<undef>. | |
2a204b45 | 934 | |
fe067ad9 SP |
935 | B<Note:> According to the Unicode standard, the compatibility decomposition |
936 | of a character that is not compatibility decomposable is the same as |
937 | the character itself. | |
2a204b45 | 938 | |
fe067ad9 | 939 | =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)> |
2a204b45 | 940 | |
fe067ad9 | 941 | If two characters here and next (as code points) are composable |
8f118dcd | 942 | (including Hangul Jamo/Syllables and Composition Exclusions), |
fe067ad9 SP |
943 | it returns the code point of the composite. |
944 | ||
945 | If they are not composable, it returns C<undef>. | |
2a204b45 | 946 | |
fe067ad9 | 947 | =item C<$combining_class = getCombinClass($code_point)> |
2a204b45 | 948 | |
fe067ad9 | 949 | It returns the combining class (as an integer) of the character. |
2a204b45 | 950 | |
fe067ad9 | 951 | =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)> |
2a204b45 | 952 | |
fe067ad9 SP |
953 | It returns a boolean whether the character of the specified codepoint |
954 | may be composed with the previous one in a certain composition | |
955 | (including Hangul Compositions, but excluding | |
956 | Composition Exclusions and Non-Starter Decompositions). | |
2a204b45 | 957 | |
fe067ad9 | 958 | =item C<$is_exclusion = isExclusion($code_point)> |
8f118dcd | 959 | |
fe067ad9 | 960 | It returns a boolean whether the code point is a composition exclusion. |
8f118dcd | 961 | |
fe067ad9 | 962 | =item C<$is_singleton = isSingleton($code_point)> |
8f118dcd | 963 | |
fe067ad9 | 964 | It returns a boolean whether the code point is a singleton. |
8f118dcd | 965 | |
fe067ad9 | 966 | =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)> |
8f118dcd | 967 | |
fe067ad9 | 968 | It returns a boolean whether the code point has Non-Starter Decomposition. |
8f118dcd | 969 | |
fe067ad9 SP |
970 | =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)> |
971 | ||
972 | It returns a boolean of the derived property Comp_Ex | |
973 | (Full_Composition_Exclusion). This property is generated from | |
974 | Composition Exclusions + Singletons + Non-Starter Decompositions. | |
975 | ||
976 | =item C<$NFD_is_NO = isNFD_NO($code_point)> | |
977 | ||
978 | It returns a boolean of the derived property NFD_NO | |
979 | (NFD_Quick_Check=No). | |
980 | ||
981 | =item C<$NFC_is_NO = isNFC_NO($code_point)> | |
982 | ||
983 | It returns a boolean of the derived property NFC_NO | |
984 | (NFC_Quick_Check=No). | |
985 | ||
986 | =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)> | |
987 | ||
988 | It returns a boolean of the derived property NFC_MAYBE | |
989 | (NFC_Quick_Check=Maybe). | |
990 | ||
991 | =item C<$NFKD_is_NO = isNFKD_NO($code_point)> | |
992 | ||
993 | It returns a boolean of the derived property NFKD_NO | |
994 | (NFKD_Quick_Check=No). | |
995 | ||
996 | =item C<$NFKC_is_NO = isNFKC_NO($code_point)> | |
997 | ||
998 | It returns a boolean of the derived property NFKC_NO | |
999 | (NFKC_Quick_Check=No). | |
1000 | ||
1001 | =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)> | |
1002 | ||
1003 | It returns a boolean of the derived property NFKC_MAYBE | |
1004 | (NFKC_Quick_Check=Maybe). | |
2a204b45 JH |
1005 | |
1006 | =back | |
1007 | ||
628bbff0 | 1008 | =head1 EXPORT |
2a204b45 JH |
1009 | |
1010 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. | |
1011 | ||
1012 | C<normalize> and some other functions: on request. |
1013 | ||
628bbff0 RGS |
1014 | =head1 CAVEATS |
1015 | ||
1016 | =over 4 | |
1017 | ||
1018 | =item Perl's version vs. Unicode version | |
1019 | ||
1020 | Since this module refers to perl core's Unicode database in the directory | |
1021 | F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of | |
1022 | normalization implemented by this module depends on your perl's version. | |
1023 | ||
fe067ad9 SP |
1024 | perl's version implemented Unicode version |
1025 | 5.6.1 3.0.1 | |
1026 | 5.7.2 3.1.0 | |
1027 | 5.7.3 3.1.1 (normalization is same as 3.1.0) | |
1028 | 5.8.0 3.2.0 | |
1029 | 5.8.1-5.8.3 4.0.0 | |
1030 | 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) | |
1031 | 5.8.7-5.8.8 4.1.0 | |
51683ce6 | 1032 | 5.10.0 5.0.0 |
dd8fe490 | 1033 | 5.8.9, 5.10.1 5.1.0 |
65e3945f | 1034 | 5.12.x 5.2.0 |
a257baa9 CBW |
1035 | 5.14.x 6.0.0 |
1036 | 5.16.x 6.1.0 | |
65e3945f | 1037 | 5.18.x 6.2.0 |
628bbff0 RGS |
1038 | |
1039 | =item Correction of decomposition mapping | |
1040 | ||
1041 | In older Unicode versions, a small number of characters (all of which are | |
1042 | CJK compatibility ideographs as far as they have been found) may have | |
1043 | an erroneous decomposition mapping (see F<NormalizationCorrections.txt>). | |
1044 | Anyhow, this module will neither refer to F<NormalizationCorrections.txt> | |
1045 | nor provide any specific version of normalization. Therefore this module | |
1046 | running on an older perl with an older Unicode database may use | |
1047 | the erroneous decomposition mapping blindly conforming to the Unicode database. | |
1048 | ||
1049 | =item Revised definition of canonical composition | |
1050 | ||
1051 | In Unicode 4.1.0, the definition D2 of canonical composition (which | |
1052 | affects NFC and NFKC) has been changed (see Public Review Issue #29 | |
1053 | and recent UAX #15). This module has used the newer definition | |
1054 | since the version 0.07 (Oct 31, 2001). | |
2b8d773d | 1055 | This module will not support the normalization according to the older |
628bbff0 RGS |
1056 | definition, even if the Unicode version implemented by perl is |
1057 | lower than 4.1.0. | |
1058 | ||
1059 | =back | |
1060 | ||
2a204b45 JH |
1061 | =head1 AUTHOR |
1062 | ||
a092bcfd | 1063 | SADAHIRO Tomoyuki <SADAHIRO@cpan.org> |
2a204b45 | 1064 | |
b7657688 | 1065 | Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved. |
2a204b45 | 1066 | |
628bbff0 RGS |
1067 | This module is free software; you can redistribute it |
1068 | and/or modify it under the same terms as Perl itself. | |
2a204b45 JH |
1069 | |
1070 | =head1 SEE ALSO | |
1071 | ||
1072 | =over 4 | |
1073 | ||
e524f5b2 | 1074 | =item http://www.unicode.org/reports/tr15/ |
2a204b45 JH |
1075 | |
1076 | Unicode Normalization Forms - UAX #15 | |
1077 | ||
fe067ad9 SP |
1078 | =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt |
1079 | ||
1080 | Composition Exclusion Table | |
1081 | ||
14e6b36c | 1082 | =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt |
8f118dcd JH |
1083 | |
1084 | Derived Normalization Properties | |
1085 | ||
628bbff0 RGS |
1086 | =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt |
1087 | ||
1088 | Normalization Corrections | |
1089 | ||
1090 | =item http://www.unicode.org/review/pr-29.html | |
1091 | ||
1092 | Public Review Issue #29: Normalization Issue | |
1093 | ||
82e740b6 NC |
1094 | =item http://www.unicode.org/notes/tn5/ |
1095 | ||
1096 | Canonical Equivalence in Applications - UTN #5 | |
1097 | ||
2a204b45 JH |
1098 | =back |
1099 | ||
1100 | =cut |