Commit | Line | Data |
---|---|---|
ac5ea531 JH |
1 | package Unicode::Normalize; |
2 | ||
3 | use 5.006; | |
4 | use strict; | |
5 | use warnings; | |
6 | use Carp; | |
7 | ||
e524f5b2 NC |
8 | no warnings 'utf8'; |
9 | ||
# Module version; also handed to XSLoader::load() further down.
our $VERSION = '1.32';

# Package name, used as the prefix of croak() messages.
our $PACKAGE = __PACKAGE__;

# Exported by default.
our @EXPORT = qw( NFC NFD NFKC NFKD );

# Exported on request only.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
    normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
);

# Import tags (":all", ":normalize", ":check", ":fast").
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);
ac5ea531 | 29 | |
fe067ad9 | 30 | ## |
788ba0f4 | 31 | ## utilities for tests |
fe067ad9 SP |
32 | ## |
33 | ||
907eab3f KW |
# Code point translators used by the test helpers.  Early perls have no
# EBCDIC support, so there (and whenever $::IS_ASCII is set) both helpers
# simply hand back their argument; otherwise they defer to the utf8::
# conversion functions.
*to_native = (!$::IS_ASCII && $] >= 5.008)
    ? sub { utf8::unicode_to_native(shift) }
    : sub { return shift };

*from_native = (!$::IS_ASCII && $] >= 5.008)
    ? sub { utf8::native_to_unicode(shift) }
    : sub { return shift };
42 | ||
# The .t files express everything as Unicode code points, so translate
# each one to its native equivalent before packing it into a string.
sub dot_t_pack_U {
    my @native_points = map { to_native($_) } @_;
    return pack('U*', @native_points);
}
47 | ||
# Inverse of dot_t_pack_U(): splits a string into its code points and
# translates each one from native back to Unicode.
sub dot_t_unpack_U {

    # pack('U*') with no values yields an empty UTF-8 string; appending it
    # forces the argument into UTF-8.  This works even on Perl 5.6, which
    # has no utf8::upgrade().
    my $upgraded = shift(@_) . pack('U*');

    my @native_points = unpack('U*', $upgraded);
    return map { from_native($_) } @native_points;
}
55 | ||
c0605d31 KW |
# Returns a form of the argument that is safe to print in test diagnostics.
#
#   * undef is rendered as the word "undef";
#   * a string made up solely of printable ASCII bytes is returned as is;
#   * anything else is rendered byte by byte as space-separated "\xNN".
#
# Works on bytes (not characters) because of "use bytes".
sub get_printable_string ($) {
    use bytes;
    my $s = shift;

    return 'undef' unless defined $s;

    # Return the string untouched only if NO byte is non-ASCII or
    # non-printable.  (Testing for the presence of a single printable byte,
    # as an unanchored positive match would, lets mixed strings through.)
    return $s unless $s =~ /[[:^ascii:][:^print:]]/;

    return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s;
}
65 | ||
2b8c1900 KW |
# Minimal test assertion used by the .t files.
#
#   ok(\$count, $got)             passes if $got is true
#   ok(\$count, $got, $expected)  passes if both are undef, or both are
#                                 defined and string-equal
#
# Increments $$count, prints "ok N" / "not ok N" to STDOUT and, on failure,
# writes diagnostics to STDERR.
sub ok ($$;$) {
    my $count_ref = shift;    # reference to the caller's test counter
    my $got       = shift;
    my $passed    = $got;     # one-value form: truth of the value itself
    my $expected;

    if (@_) {
        $expected = shift;
        if (!defined $expected) {
            $passed = !defined $got;
        }
        elsif (!defined $got) {
            $passed = 0;
        }
        else {
            $passed = $got eq $expected;
        }
    }

    my $verdict = $passed ? "ok" : "not ok";
    print $verdict, ' ', ++$$count_ref, "\n";

    return if $passed;

    # NOTE(review): caller(1) reports the frame above our immediate caller;
    # presumably the .t files reach this through a thin wrapper -- confirm.
    my ($file, $line) = (caller(1))[1, 2];
    print STDERR "# Failed test $$count_ref at $file line $line\n";

    # Without an expected value there is nothing to compare against.
    return unless defined $expected;

    print STDERR "# got ", get_printable_string($got), "\n";
    print STDERR "# expected ", get_printable_string($expected), "\n";
}
87 | ||
a96160d6 CBW |
88 | require Exporter; |
89 | ||
c6b7cc21 | 90 | ##### The above part is common to XS and PP ##### |
95f3e8d2 | 91 | |
b9a5a78f N |
92 | our @ISA = qw(Exporter); |
93 | use XSLoader (); | |
94 | XSLoader::load( 'Unicode::Normalize', $VERSION ); | |
95f3e8d2 | 95 | |
c6b7cc21 | 96 | ##### The below part is common to XS and PP ##### |
82e740b6 NC |
97 | |
98 | ## | |
a96160d6 | 99 | ## normalize |
82e740b6 NC |
100 | ## |
101 | ||
82e740b6 NC |
# Returns the string itself when checkFCD() accepts it; otherwise returns
# NFD() of the string.
sub FCD ($) {
    my $input = shift;
    return $input if checkFCD($input);
    return NFD($input);
}
82e740b6 NC |
106 | |
# Dispatch table: form name (long or short spelling) => normalizer.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,
    NFD  => \&NFD,
    D    => \&NFD,
    NFKC => \&NFKC,
    KC   => \&NFKC,
    NFKD => \&NFKD,
    KD   => \&NFKD,
    FCD  => \&FCD,
    FCC  => \&FCC,
);

# normalize($form, $string)
#
# Applies the normalizer registered in %formNorm under $form to $string and
# returns its result.  Croaks if $form is not a key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE."::normalize: invalid form name: $form")
        unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}
124 | ||
a96160d6 CBW |
125 | ## |
126 | ## partial | |
127 | ## | |
128 | ||
# normalize_partial($form, $unprocessed)
#
# Normalizes $unprocessed in the given form, splits the result with
# splitOnLastStarter(), returns the first part and stores the second part
# back into the caller's variable.  The arguments are accessed through @_
# (never copied) precisely so that the write to $_[1] reaches the caller.
# Croaks if $form is not a key of %formNorm.
sub normalize_partial ($$) {
    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]")
        unless exists $formNorm{$_[0]};

    my $normalized = normalize($_[0], $_[1]);
    my ($processed, $remainder) = splitOnLastStarter($normalized);

    $_[1] = $remainder;    # side effect: update the caller's string
    return $processed;
}
138 | ||
# Fixed-form shorthands for normalize_partial().  Each passes $_[0] itself
# (not a copy) so that the update normalize_partial() makes to its second
# argument lands in the caller's variable.
sub NFD_partial ($) {
    return normalize_partial('NFD', $_[0]);
}

sub NFC_partial ($) {
    return normalize_partial('NFC', $_[0]);
}

sub NFKD_partial($) {
    return normalize_partial('NFKD',$_[0]);
}

sub NFKC_partial($) {
    return normalize_partial('NFKC',$_[0]);
}
82e740b6 NC |
143 | |
144 | ## | |
a96160d6 | 145 | ## check |
82e740b6 NC |
146 | ## |
147 | ||
# Dispatch table: form name (long or short spelling) => quick-check function.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,
    NFD  => \&checkNFD,
    D    => \&checkNFD,
    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,
    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);

# check($form, $string)
#
# Runs the checker registered in %formCheck under $form on $string and
# returns its result.  Croaks if $form is not a key of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    croak($PACKAGE."::check: invalid form name: $form")
        unless exists $formCheck{$form};

    return $formCheck{$form}->($str);
}
165 | ||
ac5ea531 JH |
166 | 1; |
167 | __END__ | |
2a204b45 JH |
168 | |
169 | =head1 NAME | |
170 | ||
f027f502 | 171 | Unicode::Normalize - Unicode Normalization Forms |
2a204b45 JH |
172 | |
173 | =head1 SYNOPSIS | |
174 | ||
a092bcfd RGS |
175 | (1) using function names exported by default: |
176 | ||
2a204b45 JH |
177 | use Unicode::Normalize; |
178 | ||
8f118dcd JH |
179 | $NFD_string = NFD($string); # Normalization Form D |
180 | $NFC_string = NFC($string); # Normalization Form C | |
181 | $NFKD_string = NFKD($string); # Normalization Form KD | |
182 | $NFKC_string = NFKC($string); # Normalization Form KC | |
2a204b45 | 183 | |
a092bcfd | 184 | (2) using function names exported on request: |
2a204b45 JH |
185 | |
186 | use Unicode::Normalize 'normalize'; | |
187 | ||
8f118dcd JH |
188 | $NFD_string = normalize('D', $string); # Normalization Form D |
189 | $NFC_string = normalize('C', $string); # Normalization Form C | |
190 | $NFKD_string = normalize('KD', $string); # Normalization Form KD | |
191 | $NFKC_string = normalize('KC', $string); # Normalization Form KC | |
2a204b45 JH |
192 | |
193 | =head1 DESCRIPTION | |
194 | ||
00f2676f JH |
195 | Parameters: |
196 | ||
3baae3fa | 197 | C<$string> is used as a string under character semantics (see L<perlunicode>). |
00f2676f | 198 | |
fe067ad9 | 199 | C<$code_point> should be an unsigned integer representing a Unicode code point. |
00f2676f | 200 | |
c6b7cc21 SH |
201 | Note: Between XSUB and pure Perl, there is an incompatibility |
202 | about the interpretation of C<$code_point> as a decimal number. | |
203 | XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not. | |
204 | Do not use a floating-point value or a negative sign in C<$code_point>. | |
00f2676f | 205 | |
d85850a7 | 206 | =head2 Normalization Forms |
2a204b45 JH |
207 | |
208 | =over 4 | |
209 | ||
8f118dcd | 210 | =item C<$NFD_string = NFD($string)> |
2a204b45 | 211 | |
fe067ad9 | 212 | It returns the Normalization Form D (formed by canonical decomposition). |
2a204b45 | 213 | |
8f118dcd | 214 | =item C<$NFC_string = NFC($string)> |
2a204b45 | 215 | |
fe067ad9 | 216 | It returns the Normalization Form C (formed by canonical decomposition |
2a204b45 JH |
217 | followed by canonical composition). |
218 | ||
8f118dcd | 219 | =item C<$NFKD_string = NFKD($string)> |
2a204b45 | 220 | |
fe067ad9 | 221 | It returns the Normalization Form KD (formed by compatibility decomposition). |
2a204b45 | 222 | |
8f118dcd | 223 | =item C<$NFKC_string = NFKC($string)> |
2a204b45 | 224 | |
fe067ad9 | 225 | It returns the Normalization Form KC (formed by compatibility decomposition |
2a204b45 JH |
226 | followed by B<canonical> composition). |
227 | ||
82e740b6 NC |
228 | =item C<$FCD_string = FCD($string)> |
229 | ||
230 | If the given string is in FCD ("Fast C or D" form; cf. UTN #5), | |
fe067ad9 | 231 | it returns the string without modification; otherwise it returns an FCD string. |
82e740b6 NC |
232 | |
233 | Note: An FCD form is not always unique; several different FCD strings may be |
234 | equivalent to each other. C<FCD()> will return one of these equivalent forms. | |
235 | ||
236 | =item C<$FCC_string = FCC($string)> | |
237 | ||
fe067ad9 | 238 | It returns the FCC form ("Fast C Contiguous"; cf. UTN #5). |
82e740b6 | 239 | |
e524f5b2 | 240 | Note: The FCC form is unique, as are the four normalization forms (NF*). |
82e740b6 | 241 | |
8f118dcd | 242 | =item C<$normalized_string = normalize($form_name, $string)> |
2a204b45 | 243 | |
fe067ad9 SP |
244 | It returns the normalization form of C<$form_name>. |
245 | ||
2a204b45 JH |
246 | As C<$form_name>, one of the following names must be given. |
247 | ||
82e740b6 NC |
248 | 'C' or 'NFC' for Normalization Form C (UAX #15) |
249 | 'D' or 'NFD' for Normalization Form D (UAX #15) | |
250 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) | |
251 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) | |
252 | ||
253 | 'FCD' for "Fast C or D" Form (UTN #5) | |
254 | 'FCC' for "Fast C Contiguous" (UTN #5) | |
2a204b45 JH |
255 | |
256 | =back | |
257 | ||
8f118dcd JH |
258 | =head2 Decomposition and Composition |
259 | ||
260 | =over 4 | |
261 | ||
fe067ad9 | 262 | =item C<$decomposed_string = decompose($string [, $useCompatMapping])> |
8f118dcd | 263 | |
fe067ad9 SP |
264 | It returns the concatenation of the decomposition of each character |
265 | in the string. | |
8f118dcd | 266 | |
fe067ad9 SP |
267 | If the second parameter (a boolean) is omitted or false, |
268 | the decomposition is canonical decomposition; | |
269 | if the second parameter (a boolean) is true, | |
270 | the decomposition is compatibility decomposition. | |
8f118dcd | 271 | |
fe067ad9 | 272 | The string returned is not always in NFD/NFKD. Reordering may be required. |
8f118dcd | 273 | |
3baae3fa KW |
274 | $NFD_string = reorder(decompose($string)); # eq. to NFD() |
275 | $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD() | |
8f118dcd | 276 | |
fe067ad9 | 277 | =item C<$reordered_string = reorder($string)> |
8f118dcd | 278 | |
fe067ad9 SP |
279 | It returns the result of reordering the combining characters |
280 | according to Canonical Ordering Behavior. | |
8f118dcd | 281 | |
fe067ad9 SP |
282 | For example, when you have a list of NFD/NFKD strings, |
283 | you can get the concatenated NFD/NFKD string from them, by saying | |
8f118dcd JH |
284 | |
285 | $concat_NFD = reorder(join '', @NFD_strings); | |
286 | $concat_NFKD = reorder(join '', @NFKD_strings); | |
287 | ||
fe067ad9 | 288 | =item C<$composed_string = compose($string)> |
8f118dcd | 289 | |
fe067ad9 SP |
290 | It returns the result of canonical composition |
291 | without applying any decomposition. | |
8f118dcd | 292 | |
fe067ad9 SP |
293 | For example, when you have a NFD/NFKD string, |
294 | you can get its NFC/NFKC string, by saying | |
8f118dcd JH |
295 | |
296 | $NFC_string = compose($NFD_string); | |
297 | $NFKC_string = compose($NFKD_string); | |
298 | ||
a96160d6 CBW |
299 | =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)> |
300 | ||
301 | It returns two strings: the first one, C<$processed>, is the part | |
302 | before the last starter, and the second one, C<$unprocessed>, is | |
303 | the rest of the string, beginning with the last starter. A starter is a character having | |
304 | a combining class of zero (see UAX #15). | |
305 | ||
306 | Note that C<$processed> may be empty (when C<$normalized> contains no | |
307 | starter or starts with the last starter), and then C<$unprocessed> | |
308 | should be equal to the entire C<$normalized>. | |
309 | ||
310 | When you have a C<$normalized> string and an C<$unnormalized> string | |
311 | following it, a simple concatenation is wrong: | |
312 | ||
3baae3fa | 313 | $concat = $normalized . normalize($form, $unnormalized); # wrong! |
a96160d6 CBW |
314 | |
315 | Instead, do it like this: | |
316 | ||
3baae3fa KW |
317 | ($processed, $unprocessed) = splitOnLastStarter($normalized); |
318 | $concat = $processed . normalize($form,$unprocessed.$unnormalized); | |
a96160d6 CBW |
319 | |
320 | C<splitOnLastStarter()> should be called with a pre-normalized parameter | |
321 | C<$normalized>, which must be in the same form as the C<$form> you want. | |
322 | ||
323 | If you have an array C<@string> whose elements should be concatenated and then | |
324 | normalized, you can do it like this: | |
325 | ||
326 | my $result = ""; | |
327 | my $unproc = ""; | |
328 | foreach my $str (@string) { | |
329 | $unproc .= $str; | |
330 | my $n = normalize($form, $unproc); | |
331 | my($p, $u) = splitOnLastStarter($n); | |
332 | $result .= $p; | |
333 | $unproc = $u; | |
334 | } | |
335 | $result .= $unproc; | |
336 | # instead of normalize($form, join('', @string)) | |
337 | ||
338 | =item C<$processed = normalize_partial($form, $unprocessed)> | |
339 | ||
340 | A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>. | |
341 | Note that C<$unprocessed> will be modified as a side-effect. | |
342 | ||
343 | If you have an array C<@string> whose elements should be concatenated and then | |
344 | normalized, you can do it like this: | |
345 | ||
346 | my $result = ""; | |
347 | my $unproc = ""; | |
348 | foreach my $str (@string) { | |
349 | $unproc .= $str; | |
350 | $result .= normalize_partial($form, $unproc); | |
351 | } | |
352 | $result .= $unproc; | |
353 | # instead of normalize($form, join('', @string)) | |
354 | ||
355 | =item C<$processed = NFD_partial($unprocessed)> | |
356 | ||
357 | It behaves like C<normalize_partial('NFD', $unprocessed)>. | |
358 | Note that C<$unprocessed> will be modified as a side-effect. | |
359 | ||
360 | =item C<$processed = NFC_partial($unprocessed)> | |
361 | ||
362 | It behaves like C<normalize_partial('NFC', $unprocessed)>. | |
363 | Note that C<$unprocessed> will be modified as a side-effect. | |
364 | ||
365 | =item C<$processed = NFKD_partial($unprocessed)> | |
366 | ||
367 | It behaves like C<normalize_partial('NFKD', $unprocessed)>. | |
368 | Note that C<$unprocessed> will be modified as a side-effect. | |
369 | ||
370 | =item C<$processed = NFKC_partial($unprocessed)> | |
371 | ||
372 | It behaves like C<normalize_partial('NFKC', $unprocessed)>. | |
373 | Note that C<$unprocessed> will be modified as a side-effect. | |
374 | ||
8f118dcd JH |
375 | =back |
376 | ||
377 | =head2 Quick Check | |
378 | ||
f7becd03 | 379 | (see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>) |
8f118dcd JH |
380 | |
381 | The following functions check whether the string is in that normalization form. | |
382 | ||
fe067ad9 | 383 | The result returned will be one of the following: |
8f118dcd JH |
384 | |
385 | YES The string is in that normalization form. | |
386 | NO The string is not in that normalization form. | |
387 | MAYBE Dubious. Maybe yes, maybe no. | |
388 | ||
389 | =over 4 | |
390 | ||
391 | =item C<$result = checkNFD($string)> | |
392 | ||
fe067ad9 | 393 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
8f118dcd JH |
394 | |
395 | =item C<$result = checkNFC($string)> | |
396 | ||
fe067ad9 | 397 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 398 | C<undef> if C<MAYBE>. |
8f118dcd JH |
399 | |
400 | =item C<$result = checkNFKD($string)> | |
401 | ||
fe067ad9 | 402 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
8f118dcd JH |
403 | |
404 | =item C<$result = checkNFKC($string)> | |
405 | ||
fe067ad9 | 406 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 407 | C<undef> if C<MAYBE>. |
8f118dcd | 408 | |
82e740b6 NC |
409 | =item C<$result = checkFCD($string)> |
410 | ||
fe067ad9 | 411 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>. |
82e740b6 NC |
412 | |
413 | =item C<$result = checkFCC($string)> | |
414 | ||
fe067ad9 | 415 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 416 | C<undef> if C<MAYBE>. |
82e740b6 | 417 | |
fe067ad9 | 418 | Note: If a string is not in FCD, it cannot be in FCC. |
82e740b6 NC |
419 | So C<checkFCC($not_FCD_string)> should return C<NO>. |
420 | ||
8f118dcd JH |
421 | =item C<$result = check($form_name, $string)> |
422 | ||
fe067ad9 | 423 | It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>; |
628bbff0 | 424 | C<undef> if C<MAYBE>. |
8f118dcd | 425 | |
628bbff0 RGS |
426 | As C<$form_name>, one of the following names must be given. |
427 | ||
428 | 'C' or 'NFC' for Normalization Form C (UAX #15) | |
429 | 'D' or 'NFD' for Normalization Form D (UAX #15) | |
430 | 'KC' or 'NFKC' for Normalization Form KC (UAX #15) | |
431 | 'KD' or 'NFKD' for Normalization Form KD (UAX #15) | |
432 | ||
433 | 'FCD' for "Fast C or D" Form (UTN #5) | |
434 | 'FCC' for "Fast C Contiguous" (UTN #5) | |
8f118dcd JH |
435 | |
436 | =back | |
437 | ||
438 | B<Note> | |
439 | ||
82e740b6 NC |
440 | In the cases of NFD, NFKD, and FCD, the answer must be |
441 | either C<YES> or C<NO>. The answer C<MAYBE> may be returned | |
442 | in the cases of NFC, NFKC, and FCC. | |
8f118dcd | 443 | |
82e740b6 NC |
444 | A C<MAYBE> string should contain at least one combining character |
445 | or the like. For example, C<COMBINING ACUTE ACCENT> has | |
8f118dcd | 446 | the MAYBE_NFC/MAYBE_NFKC property. |
82e740b6 | 447 | |
8f118dcd JH |
448 | Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")> |
449 | and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>. | |
f027f502 | 450 | C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC |
8f118dcd JH |
451 | (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">), |
452 | while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC. | |
453 | ||
628bbff0 RGS |
454 | If you want to check exactly, compare the string with its NFC/NFKC/FCC. |
455 | ||
456 | if ($string eq NFC($string)) { | |
a96160d6 | 457 | # $string is exactly normalized in NFC; |
628bbff0 | 458 | } else { |
a96160d6 | 459 | # $string is not normalized in NFC; |
628bbff0 | 460 | } |
8f118dcd | 461 | |
628bbff0 | 462 | if ($string eq NFKC($string)) { |
a96160d6 | 463 | # $string is exactly normalized in NFKC; |
628bbff0 | 464 | } else { |
a96160d6 | 465 | # $string is not normalized in NFKC; |
628bbff0 | 466 | } |
8f118dcd | 467 | |
2a204b45 JH |
468 | =head2 Character Data |
469 | ||
470 | These functions are an interface to the character data used internally. | |
d0ed0342 BG |
471 | If you only want to get Unicode normalization forms, you don't need |
472 | to call them yourself. | |
2a204b45 JH |
473 | |
474 | =over 4 | |
475 | ||
fe067ad9 | 476 | =item C<$canonical_decomposition = getCanon($code_point)> |
2a204b45 | 477 | |
fe067ad9 SP |
478 | If the character is canonically decomposable (including Hangul Syllables), |
479 | it returns the (full) canonical decomposition as a string. | |
480 | Otherwise it returns C<undef>. | |
8f118dcd | 481 | |
fe067ad9 SP |
482 | B<Note:> According to the Unicode standard, the canonical decomposition |
483 | of a character that is not canonically decomposable is the same as | |
484 | the character itself. | |
8f118dcd | 485 | |
fe067ad9 | 486 | =item C<$compatibility_decomposition = getCompat($code_point)> |
2a204b45 | 487 | |
fe067ad9 SP |
488 | If the character is compatibility decomposable (including Hangul Syllables), |
489 | it returns the (full) compatibility decomposition as a string. | |
490 | Otherwise it returns C<undef>. | |
2a204b45 | 491 | |
fe067ad9 SP |
492 | B<Note:> According to the Unicode standard, the compatibility decomposition |
493 | of a character that is not compatibility decomposable is the same as | |
494 | the character itself. | |
2a204b45 | 495 | |
fe067ad9 | 496 | =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)> |
2a204b45 | 497 | |
fe067ad9 | 498 | If two characters here and next (as code points) are composable |
8f118dcd | 499 | (including Hangul Jamo/Syllables and Composition Exclusions), |
fe067ad9 SP |
500 | it returns the code point of the composite. |
501 | ||
502 | If they are not composable, it returns C<undef>. | |
2a204b45 | 503 | |
fe067ad9 | 504 | =item C<$combining_class = getCombinClass($code_point)> |
2a204b45 | 505 | |
fe067ad9 | 506 | It returns the combining class (as an integer) of the character. |
2a204b45 | 507 | |
fe067ad9 | 508 | =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)> |
2a204b45 | 509 | |
fe067ad9 SP |
510 | It returns a boolean whether the character of the specified codepoint |
511 | may be composed with the previous one in a certain composition | |
512 | (including Hangul Compositions, but excluding | |
513 | Composition Exclusions and Non-Starter Decompositions). | |
2a204b45 | 514 | |
fe067ad9 | 515 | =item C<$is_exclusion = isExclusion($code_point)> |
8f118dcd | 516 | |
fe067ad9 | 517 | It returns a boolean whether the code point is a composition exclusion. |
8f118dcd | 518 | |
fe067ad9 | 519 | =item C<$is_singleton = isSingleton($code_point)> |
8f118dcd | 520 | |
fe067ad9 | 521 | It returns a boolean whether the code point is a singleton. |
8f118dcd | 522 | |
fe067ad9 | 523 | =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)> |
8f118dcd | 524 | |
fe067ad9 | 525 | It returns a boolean whether the code point has Non-Starter Decomposition. |
8f118dcd | 526 | |
fe067ad9 SP |
527 | =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)> |
528 | ||
529 | It returns a boolean of the derived property Comp_Ex | |
530 | (Full_Composition_Exclusion). This property is generated from | |
531 | Composition Exclusions + Singletons + Non-Starter Decompositions. | |
532 | ||
533 | =item C<$NFD_is_NO = isNFD_NO($code_point)> | |
534 | ||
535 | It returns a boolean of the derived property NFD_NO | |
536 | (NFD_Quick_Check=No). | |
537 | ||
538 | =item C<$NFC_is_NO = isNFC_NO($code_point)> | |
539 | ||
540 | It returns a boolean of the derived property NFC_NO | |
541 | (NFC_Quick_Check=No). | |
542 | ||
543 | =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)> | |
544 | ||
545 | It returns a boolean of the derived property NFC_MAYBE | |
546 | (NFC_Quick_Check=Maybe). | |
547 | ||
548 | =item C<$NFKD_is_NO = isNFKD_NO($code_point)> | |
549 | ||
550 | It returns a boolean of the derived property NFKD_NO | |
551 | (NFKD_Quick_Check=No). | |
552 | ||
553 | =item C<$NFKC_is_NO = isNFKC_NO($code_point)> | |
554 | ||
555 | It returns a boolean of the derived property NFKC_NO | |
556 | (NFKC_Quick_Check=No). | |
557 | ||
558 | =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)> | |
559 | ||
560 | It returns a boolean of the derived property NFKC_MAYBE | |
561 | (NFKC_Quick_Check=Maybe). | |
2a204b45 JH |
562 | |
563 | =back | |
564 | ||
628bbff0 | 565 | =head1 EXPORT |
2a204b45 JH |
566 | |
567 | C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default. | |
568 | ||
569 | C<normalize> and some other functions: on request. | |
570 | ||
628bbff0 RGS |
571 | =head1 CAVEATS |
572 | ||
573 | =over 4 | |
574 | ||
575 | =item Perl's version vs. Unicode version | |
576 | ||
577 | Since this module refers to perl core's Unicode database in the directory | |
578 | F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of | |
3ece07bc SH |
579 | normalization implemented by this module depends on what has been |
580 | compiled into your perl. The following table lists the default Unicode | |
581 | version that comes with various perl versions. (It is possible to change | |
582 | the Unicode version in any perl version to be any earlier Unicode version, | |
583 | so one could cause Unicode 3.2 to be used in any perl version starting with | |
3baae3fa | 584 | 5.8.0. Read F<C<$Config{privlib}>/unicore/README.perl> for details.) |
628bbff0 | 585 | |
fe067ad9 SP |
586 | perl's version implemented Unicode version |
587 | 5.6.1 3.0.1 | |
588 | 5.7.2 3.1.0 | |
589 | 5.7.3 3.1.1 (normalization is same as 3.1.0) | |
590 | 5.8.0 3.2.0 | |
3ece07bc SH |
591 | 5.8.1-5.8.3 4.0.0 |
592 | 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) | |
593 | 5.8.7-5.8.8 4.1.0 | |
51683ce6 | 594 | 5.10.0 5.0.0 |
3ece07bc | 595 | 5.8.9, 5.10.1 5.1.0 |
65e3945f | 596 | 5.12.x 5.2.0 |
a257baa9 CBW |
597 | 5.14.x 6.0.0 |
598 | 5.16.x 6.1.0 | |
65e3945f | 599 | 5.18.x 6.2.0 |
3ece07bc SH |
600 | 5.20.x 6.3.0 |
601 | 5.22.x 7.0.0 | |
628bbff0 RGS |
602 | |
603 | =item Correction of decomposition mapping | |
604 | ||
605 | In older Unicode versions, a small number of characters (all of which are | |
606 | CJK compatibility ideographs as far as they have been found) may have | |
f7becd03 KW |
607 | an erroneous decomposition mapping (see |
608 | F<lib/unicore/NormalizationCorrections.txt>). | |
609 | Anyhow, this module will neither refer to | |
610 | F<lib/unicore/NormalizationCorrections.txt> | |
628bbff0 RGS |
611 | nor provide any specific version of normalization. Therefore this module |
612 | running on an older perl with an older Unicode database may use | |
613 | the erroneous decomposition mapping blindly conforming to the Unicode database. | |
614 | ||
615 | =item Revised definition of canonical composition | |
616 | ||
617 | In Unicode 4.1.0, the definition D2 of canonical composition (which | |
618 | affects NFC and NFKC) has been changed (see Public Review Issue #29 | |
619 | and recent UAX #15). This module has used the newer definition | |
620 | since the version 0.07 (Oct 31, 2001). | |
2b8d773d | 621 | This module will not support the normalization according to the older |
628bbff0 RGS |
622 | definition, even if the Unicode version implemented by perl is |
623 | lower than 4.1.0. | |
624 | ||
625 | =back | |
626 | ||
2a204b45 JH |
627 | =head1 AUTHOR |
628 | ||
a092bcfd | 629 | SADAHIRO Tomoyuki <SADAHIRO@cpan.org> |
2a204b45 | 630 | |
c6b7cc21 SH |
631 | Currently maintained by <perl5-porters@perl.org> |
632 | ||
b7657688 | 633 | Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved. |
2a204b45 | 634 | |
662aea32 CBW |
635 | =head1 LICENSE |
636 | ||
628bbff0 RGS |
637 | This module is free software; you can redistribute it |
638 | and/or modify it under the same terms as Perl itself. | |
2a204b45 JH |
639 | |
640 | =head1 SEE ALSO | |
641 | ||
642 | =over 4 | |
643 | ||
1a2bd084 | 644 | =item L<http://www.unicode.org/reports/tr15/> |
2a204b45 JH |
645 | |
646 | Unicode Normalization Forms - UAX #15 | |
647 | ||
1a2bd084 | 648 | =item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt> |
fe067ad9 SP |
649 | |
650 | Composition Exclusion Table | |
651 | ||
1a2bd084 | 652 | =item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt> |
8f118dcd JH |
653 | |
654 | Derived Normalization Properties | |
655 | ||
1a2bd084 | 656 | =item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt> |
628bbff0 RGS |
657 | |
658 | Normalization Corrections | |
659 | ||
1a2bd084 | 660 | =item L<http://www.unicode.org/review/pr-29.html> |
628bbff0 RGS |
661 | |
662 | Public Review Issue #29: Normalization Issue | |
663 | ||
1a2bd084 | 664 | =item L<http://www.unicode.org/notes/tn5/> |
82e740b6 NC |
665 | |
666 | Canonical Equivalence in Applications - UTN #5 | |
667 | ||
2a204b45 JH |
668 | =back |
669 | ||
670 | =cut |