This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
ParseXS - better support for duplicate ALIASes
[perl5.git] / dist / Unicode-Normalize / Normalize.pm
CommitLineData
ac5ea531
JH
1package Unicode::Normalize;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7
e524f5b2
NC
8no warnings 'utf8';
9
f7becd03 10our $VERSION = '1.32';
ac5ea531
JH
11our $PACKAGE = __PACKAGE__;
12
ac5ea531 13our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45
JH
14our @EXPORT_OK = qw(
15 normalize decompose reorder compose
8f118dcd
JH
16 checkNFD checkNFKD checkNFC checkNFKC check
17 getCanon getCompat getComposite getCombinClass
18 isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
19 isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
a96160d6
CBW
20 FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
21 normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
8f118dcd
JH
22);
23our %EXPORT_TAGS = (
24 all => [ @EXPORT, @EXPORT_OK ],
25 normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
26 check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
82e740b6 27 fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
2a204b45 28);
ac5ea531 29
fe067ad9 30##
788ba0f4 31## utilities for tests
fe067ad9
SP
32##
33
907eab3f
KW
34 # No EBCDIC support on early perls
35*to_native = ($::IS_ASCII || $] < 5.008)
36 ? sub { return shift }
37 : sub { utf8::unicode_to_native(shift) };
38
39*from_native = ($::IS_ASCII || $] < 5.008)
40 ? sub { return shift }
41 : sub { utf8::native_to_unicode(shift) };
42
43# The .t files are all in terms of Unicode, so xlate to/from native
44sub dot_t_pack_U {
45 return pack('U*', map { to_native($_) } @_);
9f1f04a1
RGS
46}
47
907eab3f 48sub dot_t_unpack_U {
3ece07bc
SH
49
50 # The empty pack returns an empty UTF-8 string, so the effect is to force
c6b7cc21
SH
51 # the shifted parameter into being UTF-8. This allows this to work on
52 # Perl 5.6, where there is no utf8::upgrade().
907eab3f 53 return map { from_native($_) } unpack('U*', shift(@_).pack('U*'));
2b8c1900
KW
54}
55
c0605d31
KW
56sub get_printable_string ($) {
57 use bytes;
58 my $s = shift;
59
60 # DeMorgan's laws cause this to mean ascii printables
61 return $s if $s =~ /[^[:^ascii:][:^print:]]/;
62
63 return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s;
64}
65
2b8c1900
KW
66sub ok ($$;$) {
67 my $count_ref = shift; # Test number in caller
68 my $p = my $r = shift;
c0605d31 69 my $x;
2b8c1900 70 if (@_) {
c0605d31
KW
71 $x = shift;
72 $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x;
2b8c1900
KW
73 }
74
75 print $p ? "ok" : "not ok", ' ', ++$$count_ref, "\n";
c0605d31
KW
76
77 return if $p;
78
79 my (undef, $file, $line) = caller(1);
80 print STDERR "# Failed test $$count_ref at $file line $line\n";
81
82 return unless defined $x;
83
84 print STDERR "# got ", get_printable_string($r), "\n";
85 print STDERR "# expected ", get_printable_string($x), "\n";
9f1f04a1
RGS
86}
87
a96160d6
CBW
88require Exporter;
89
c6b7cc21 90##### The above part is common to XS and PP #####
95f3e8d2 91
b9a5a78f
N
92our @ISA = qw(Exporter);
93use XSLoader ();
94XSLoader::load( 'Unicode::Normalize', $VERSION );
95f3e8d2 95
c6b7cc21 96##### The below part is common to XS and PP #####
82e740b6
NC
97
98##
a96160d6 99## normalize
82e740b6
NC
100##
101
82e740b6
NC
102sub FCD ($) {
103 my $str = shift;
104 return checkFCD($str) ? $str : NFD($str);
105}
82e740b6
NC
106
107our %formNorm = (
108 NFC => \&NFC, C => \&NFC,
109 NFD => \&NFD, D => \&NFD,
110 NFKC => \&NFKC, KC => \&NFKC,
111 NFKD => \&NFKD, KD => \&NFKD,
112 FCD => \&FCD, FCC => \&FCC,
113);
114
ac5ea531
JH
115sub normalize($$)
116{
d85850a7 117 my $form = shift;
f027f502 118 my $str = shift;
fe067ad9
SP
119 if (exists $formNorm{$form}) {
120 return $formNorm{$form}->($str);
121 }
122 croak($PACKAGE."::normalize: invalid form name: $form");
ac5ea531
JH
123}
124
a96160d6
CBW
125##
126## partial
127##
128
129sub normalize_partial ($$) {
130 if (exists $formNorm{$_[0]}) {
131 my $n = normalize($_[0], $_[1]);
132 my($p, $u) = splitOnLastStarter($n);
133 $_[1] = $u;
134 return $p;
135 }
136 croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
137}
138
139sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
140sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
141sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
142sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
82e740b6
NC
143
144##
a96160d6 145## check
82e740b6
NC
146##
147
148our %formCheck = (
149 NFC => \&checkNFC, C => \&checkNFC,
150 NFD => \&checkNFD, D => \&checkNFD,
151 NFKC => \&checkNFKC, KC => \&checkNFKC,
152 NFKD => \&checkNFKD, KD => \&checkNFKD,
153 FCD => \&checkFCD, FCC => \&checkFCC,
154);
155
8f118dcd
JH
156sub check($$)
157{
158 my $form = shift;
f027f502 159 my $str = shift;
fe067ad9
SP
160 if (exists $formCheck{$form}) {
161 return $formCheck{$form}->($str);
162 }
163 croak($PACKAGE."::check: invalid form name: $form");
8f118dcd
JH
164}
165
ac5ea531
JH
1661;
167__END__
2a204b45
JH
168
169=head1 NAME
170
f027f502 171Unicode::Normalize - Unicode Normalization Forms
2a204b45
JH
172
173=head1 SYNOPSIS
174
a092bcfd
RGS
175(1) using function names exported by default:
176
2a204b45
JH
177 use Unicode::Normalize;
178
8f118dcd
JH
179 $NFD_string = NFD($string); # Normalization Form D
180 $NFC_string = NFC($string); # Normalization Form C
181 $NFKD_string = NFKD($string); # Normalization Form KD
182 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 183
a092bcfd 184(2) using function names exported on request:
2a204b45
JH
185
186 use Unicode::Normalize 'normalize';
187
8f118dcd
JH
188 $NFD_string = normalize('D', $string); # Normalization Form D
189 $NFC_string = normalize('C', $string); # Normalization Form C
190 $NFKD_string = normalize('KD', $string); # Normalization Form KD
191 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45
JH
192
193=head1 DESCRIPTION
194
00f2676f
JH
195Parameters:
196
3baae3fa 197C<$string> is used as a string under character semantics (see L<perlunicode>).
00f2676f 198
fe067ad9 199C<$code_point> should be an unsigned integer representing a Unicode code point.
00f2676f 200
c6b7cc21
SH
201Note: Between XSUB and pure Perl, there is an incompatibility
202about the interpretation of C<$code_point> as a decimal number.
203XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
204Do not use a floating point nor a negative sign in C<$code_point>.
00f2676f 205
d85850a7 206=head2 Normalization Forms
2a204b45
JH
207
208=over 4
209
8f118dcd 210=item C<$NFD_string = NFD($string)>
2a204b45 211
fe067ad9 212It returns the Normalization Form D (formed by canonical decomposition).
2a204b45 213
8f118dcd 214=item C<$NFC_string = NFC($string)>
2a204b45 215
fe067ad9 216It returns the Normalization Form C (formed by canonical decomposition
2a204b45
JH
217followed by canonical composition).
218
8f118dcd 219=item C<$NFKD_string = NFKD($string)>
2a204b45 220
fe067ad9 221It returns the Normalization Form KD (formed by compatibility decomposition).
2a204b45 222
8f118dcd 223=item C<$NFKC_string = NFKC($string)>
2a204b45 224
fe067ad9 225It returns the Normalization Form KC (formed by compatibility decomposition
2a204b45
JH
226followed by B<canonical> composition).
227
82e740b6
NC
228=item C<$FCD_string = FCD($string)>
229
230If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
fe067ad9 231it returns the string without modification; otherwise it returns an FCD string.
82e740b6
NC
232
233Note: FCD is not always unique, then plural forms may be equivalent
234each other. C<FCD()> will return one of these equivalent forms.
235
236=item C<$FCC_string = FCC($string)>
237
fe067ad9 238It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
82e740b6 239
e524f5b2 240Note: FCC is unique, as well as four normalization forms (NF*).
82e740b6 241
8f118dcd 242=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 243
fe067ad9
SP
244It returns the normalization form of C<$form_name>.
245
2a204b45
JH
246As C<$form_name>, one of the following names must be given.
247
82e740b6
NC
248 'C' or 'NFC' for Normalization Form C (UAX #15)
249 'D' or 'NFD' for Normalization Form D (UAX #15)
250 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
251 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
252
253 'FCD' for "Fast C or D" Form (UTN #5)
254 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45
JH
255
256=back
257
8f118dcd
JH
258=head2 Decomposition and Composition
259
260=over 4
261
fe067ad9 262=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
8f118dcd 263
fe067ad9
SP
264It returns the concatenation of the decomposition of each character
265in the string.
8f118dcd 266
fe067ad9
SP
267If the second parameter (a boolean) is omitted or false,
268the decomposition is canonical decomposition;
269if the second parameter (a boolean) is true,
270the decomposition is compatibility decomposition.
8f118dcd 271
fe067ad9 272The string returned is not always in NFD/NFKD. Reordering may be required.
8f118dcd 273
3baae3fa
KW
274 $NFD_string = reorder(decompose($string)); # eq. to NFD()
275 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
8f118dcd 276
fe067ad9 277=item C<$reordered_string = reorder($string)>
8f118dcd 278
fe067ad9
SP
279It returns the result of reordering the combining characters
280according to Canonical Ordering Behavior.
8f118dcd 281
fe067ad9
SP
282For example, when you have a list of NFD/NFKD strings,
283you can get the concatenated NFD/NFKD string from them, by saying
8f118dcd
JH
284
285 $concat_NFD = reorder(join '', @NFD_strings);
286 $concat_NFKD = reorder(join '', @NFKD_strings);
287
fe067ad9 288=item C<$composed_string = compose($string)>
8f118dcd 289
fe067ad9
SP
290It returns the result of canonical composition
291without applying any decomposition.
8f118dcd 292
fe067ad9
SP
293For example, when you have a NFD/NFKD string,
294you can get its NFC/NFKC string, by saying
8f118dcd
JH
295
296 $NFC_string = compose($NFD_string);
297 $NFKC_string = compose($NFKD_string);
298
a96160d6
CBW
299=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
300
301It returns two strings: the first one, C<$processed>, is a part
302before the last starter, and the second one, C<$unprocessed> is
303another part after the first part. A starter is a character having
304a combining class of zero (see UAX #15).
305
306Note that C<$processed> may be empty (when C<$normalized> contains no
307starter or starts with the last starter), and then C<$unprocessed>
308should be equal to the entire C<$normalized>.
309
310When you have a C<$normalized> string and an C<$unnormalized> string
311following it, a simple concatenation is wrong:
312
3baae3fa 313 $concat = $normalized . normalize($form, $unnormalized); # wrong!
a96160d6
CBW
314
315Instead of it, do like this:
316
3baae3fa
KW
317 ($processed, $unprocessed) = splitOnLastStarter($normalized);
318 $concat = $processed . normalize($form,$unprocessed.$unnormalized);
a96160d6
CBW
319
320C<splitOnLastStarter()> should be called with a pre-normalized parameter
321C<$normalized>, that is in the same form as C<$form> you want.
322
323If you have an array of C<@string> that should be concatenated and then
324normalized, you can do like this:
325
326 my $result = "";
327 my $unproc = "";
328 foreach my $str (@string) {
329 $unproc .= $str;
330 my $n = normalize($form, $unproc);
331 my($p, $u) = splitOnLastStarter($n);
332 $result .= $p;
333 $unproc = $u;
334 }
335 $result .= $unproc;
336 # instead of normalize($form, join('', @string))
337
338=item C<$processed = normalize_partial($form, $unprocessed)>
339
340A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
341Note that C<$unprocessed> will be modified as a side-effect.
342
343If you have an array of C<@string> that should be concatenated and then
344normalized, you can do like this:
345
346 my $result = "";
347 my $unproc = "";
348 foreach my $str (@string) {
349 $unproc .= $str;
350 $result .= normalize_partial($form, $unproc);
351 }
352 $result .= $unproc;
353 # instead of normalize($form, join('', @string))
354
355=item C<$processed = NFD_partial($unprocessed)>
356
357It does like C<normalize_partial('NFD', $unprocessed)>.
358Note that C<$unprocessed> will be modified as a side-effect.
359
360=item C<$processed = NFC_partial($unprocessed)>
361
362It does like C<normalize_partial('NFC', $unprocessed)>.
363Note that C<$unprocessed> will be modified as a side-effect.
364
365=item C<$processed = NFKD_partial($unprocessed)>
366
367It does like C<normalize_partial('NFKD', $unprocessed)>.
368Note that C<$unprocessed> will be modified as a side-effect.
369
370=item C<$processed = NFKC_partial($unprocessed)>
371
372It does like C<normalize_partial('NFKC', $unprocessed)>.
373Note that C<$unprocessed> will be modified as a side-effect.
374
8f118dcd
JH
375=back
376
377=head2 Quick Check
378
f7becd03 379(see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>)
8f118dcd
JH
380
381The following functions check whether the string is in that normalization form.
382
fe067ad9 383The result returned will be one of the following:
8f118dcd
JH
384
385 YES The string is in that normalization form.
386 NO The string is not in that normalization form.
387 MAYBE Dubious. Maybe yes, maybe no.
388
389=over 4
390
391=item C<$result = checkNFD($string)>
392
fe067ad9 393It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
394
395=item C<$result = checkNFC($string)>
396
fe067ad9 397It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 398C<undef> if C<MAYBE>.
8f118dcd
JH
399
400=item C<$result = checkNFKD($string)>
401
fe067ad9 402It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
403
404=item C<$result = checkNFKC($string)>
405
fe067ad9 406It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 407C<undef> if C<MAYBE>.
8f118dcd 408
82e740b6
NC
409=item C<$result = checkFCD($string)>
410
fe067ad9 411It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
82e740b6
NC
412
413=item C<$result = checkFCC($string)>
414
fe067ad9 415It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 416C<undef> if C<MAYBE>.
82e740b6 417
fe067ad9 418Note: If a string is not in FCD, it must not be in FCC.
82e740b6
NC
419So C<checkFCC($not_FCD_string)> should return C<NO>.
420
8f118dcd
JH
421=item C<$result = check($form_name, $string)>
422
fe067ad9 423It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 424C<undef> if C<MAYBE>.
8f118dcd 425
628bbff0
RGS
426As C<$form_name>, one of the following names must be given.
427
428 'C' or 'NFC' for Normalization Form C (UAX #15)
429 'D' or 'NFD' for Normalization Form D (UAX #15)
430 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
431 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
432
433 'FCD' for "Fast C or D" Form (UTN #5)
434 'FCC' for "Fast C Contiguous" (UTN #5)
8f118dcd
JH
435
436=back
437
438B<Note>
439
82e740b6
NC
440In the cases of NFD, NFKD, and FCD, the answer must be
441either C<YES> or C<NO>. The answer C<MAYBE> may be returned
442in the cases of NFC, NFKC, and FCC.
8f118dcd 443
82e740b6
NC
444A C<MAYBE> string should contain at least one combining character
445or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 446the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 447
8f118dcd
JH
448Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
449and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 450C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd
JH
451(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
452while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
453
628bbff0
RGS
454If you want to check exactly, compare the string with its NFC/NFKC/FCC.
455
456 if ($string eq NFC($string)) {
a96160d6 457 # $string is exactly normalized in NFC;
628bbff0 458 } else {
a96160d6 459 # $string is not normalized in NFC;
628bbff0 460 }
8f118dcd 461
628bbff0 462 if ($string eq NFKC($string)) {
a96160d6 463 # $string is exactly normalized in NFKC;
628bbff0 464 } else {
a96160d6 465 # $string is not normalized in NFKC;
628bbff0 466 }
8f118dcd 467
2a204b45
JH
468=head2 Character Data
469
470These functions are interface of character data used internally.
d0ed0342
BG
471If you want only to get Unicode normalization forms, you don't need
472call them yourself.
2a204b45
JH
473
474=over 4
475
fe067ad9 476=item C<$canonical_decomposition = getCanon($code_point)>
2a204b45 477
fe067ad9
SP
478If the character is canonically decomposable (including Hangul Syllables),
479it returns the (full) canonical decomposition as a string.
480Otherwise it returns C<undef>.
8f118dcd 481
fe067ad9
SP
482B<Note:> According to the Unicode standard, the canonical decomposition
483of the character that is not canonically decomposable is same as
484the character itself.
8f118dcd 485
fe067ad9 486=item C<$compatibility_decomposition = getCompat($code_point)>
2a204b45 487
fe067ad9
SP
488If the character is compatibility decomposable (including Hangul Syllables),
489it returns the (full) compatibility decomposition as a string.
490Otherwise it returns C<undef>.
2a204b45 491
fe067ad9
SP
492B<Note:> According to the Unicode standard, the compatibility decomposition
493of the character that is not compatibility decomposable is same as
494the character itself.
2a204b45 495
fe067ad9 496=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
2a204b45 497
fe067ad9 498If two characters here and next (as code points) are composable
8f118dcd 499(including Hangul Jamo/Syllables and Composition Exclusions),
fe067ad9
SP
500it returns the code point of the composite.
501
502If they are not composable, it returns C<undef>.
2a204b45 503
fe067ad9 504=item C<$combining_class = getCombinClass($code_point)>
2a204b45 505
fe067ad9 506It returns the combining class (as an integer) of the character.
2a204b45 507
fe067ad9 508=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
2a204b45 509
fe067ad9
SP
510It returns a boolean whether the character of the specified codepoint
511may be composed with the previous one in a certain composition
512(including Hangul Compositions, but excluding
513Composition Exclusions and Non-Starter Decompositions).
2a204b45 514
fe067ad9 515=item C<$is_exclusion = isExclusion($code_point)>
8f118dcd 516
fe067ad9 517It returns a boolean whether the code point is a composition exclusion.
8f118dcd 518
fe067ad9 519=item C<$is_singleton = isSingleton($code_point)>
8f118dcd 520
fe067ad9 521It returns a boolean whether the code point is a singleton
8f118dcd 522
fe067ad9 523=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
8f118dcd 524
fe067ad9 525It returns a boolean whether the code point has Non-Starter Decomposition.
8f118dcd 526
fe067ad9
SP
527=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
528
529It returns a boolean of the derived property Comp_Ex
530(Full_Composition_Exclusion). This property is generated from
531Composition Exclusions + Singletons + Non-Starter Decompositions.
532
533=item C<$NFD_is_NO = isNFD_NO($code_point)>
534
535It returns a boolean of the derived property NFD_NO
536(NFD_Quick_Check=No).
537
538=item C<$NFC_is_NO = isNFC_NO($code_point)>
539
540It returns a boolean of the derived property NFC_NO
541(NFC_Quick_Check=No).
542
543=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
544
545It returns a boolean of the derived property NFC_MAYBE
546(NFC_Quick_Check=Maybe).
547
548=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
549
550It returns a boolean of the derived property NFKD_NO
551(NFKD_Quick_Check=No).
552
553=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
554
555It returns a boolean of the derived property NFKC_NO
556(NFKC_Quick_Check=No).
557
558=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
559
560It returns a boolean of the derived property NFKC_MAYBE
561(NFKC_Quick_Check=Maybe).
2a204b45
JH
562
563=back
564
628bbff0 565=head1 EXPORT
2a204b45
JH
566
567C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
568
569C<normalize> and other some functions: on request.
570
628bbff0
RGS
571=head1 CAVEATS
572
573=over 4
574
575=item Perl's version vs. Unicode version
576
577Since this module refers to perl core's Unicode database in the directory
578F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
3ece07bc
SH
579normalization implemented by this module depends on what has been
580compiled into your perl. The following table lists the default Unicode
581version that comes with various perl versions. (It is possible to change
582the Unicode version in any perl version to be any earlier Unicode version,
583so one could cause Unicode 3.2 to be used in any perl version starting with
3baae3fa 5845.8.0. Read F<C<$Config{privlib}>/unicore/README.perl> for details.
628bbff0 585
fe067ad9
SP
586 perl's version implemented Unicode version
587 5.6.1 3.0.1
588 5.7.2 3.1.0
589 5.7.3 3.1.1 (normalization is same as 3.1.0)
590 5.8.0 3.2.0
3ece07bc
SH
591 5.8.1-5.8.3 4.0.0
592 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
593 5.8.7-5.8.8 4.1.0
51683ce6 594 5.10.0 5.0.0
3ece07bc 595 5.8.9, 5.10.1 5.1.0
65e3945f 596 5.12.x 5.2.0
a257baa9
CBW
597 5.14.x 6.0.0
598 5.16.x 6.1.0
65e3945f 599 5.18.x 6.2.0
3ece07bc
SH
600 5.20.x 6.3.0
601 5.22.x 7.0.0
628bbff0
RGS
602
603=item Correction of decomposition mapping
604
605In older Unicode versions, a small number of characters (all of which are
606CJK compatibility ideographs as far as they have been found) may have
f7becd03
KW
607an erroneous decomposition mapping (see
608F<lib/unicore/NormalizationCorrections.txt>).
609Anyhow, this module will neither refer to
610F<lib/unicore/NormalizationCorrections.txt>
628bbff0
RGS
611nor provide any specific version of normalization. Therefore this module
612running on an older perl with an older Unicode database may use
613the erroneous decomposition mapping blindly conforming to the Unicode database.
614
615=item Revised definition of canonical composition
616
617In Unicode 4.1.0, the definition D2 of canonical composition (which
618affects NFC and NFKC) has been changed (see Public Review Issue #29
619and recent UAX #15). This module has used the newer definition
620since the version 0.07 (Oct 31, 2001).
2b8d773d 621This module will not support the normalization according to the older
628bbff0
RGS
622definition, even if the Unicode version implemented by perl is
623lower than 4.1.0.
624
625=back
626
2a204b45
JH
627=head1 AUTHOR
628
a092bcfd 629SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
2a204b45 630
c6b7cc21
SH
631Currently maintained by <perl5-porters@perl.org>
632
b7657688 633Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 634
662aea32
CBW
635=head1 LICENSE
636
628bbff0
RGS
637This module is free software; you can redistribute it
638and/or modify it under the same terms as Perl itself.
2a204b45
JH
639
640=head1 SEE ALSO
641
642=over 4
643
1a2bd084 644=item L<http://www.unicode.org/reports/tr15/>
2a204b45
JH
645
646Unicode Normalization Forms - UAX #15
647
1a2bd084 648=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt>
fe067ad9
SP
649
650Composition Exclusion Table
651
1a2bd084 652=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt>
8f118dcd
JH
653
654Derived Normalization Properties
655
1a2bd084 656=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt>
628bbff0
RGS
657
658Normalization Corrections
659
1a2bd084 660=item L<http://www.unicode.org/review/pr-29.html>
628bbff0
RGS
661
662Public Review Issue #29: Normalization Issue
663
1a2bd084 664=item L<http://www.unicode.org/notes/tn5/>
82e740b6
NC
665
666Canonical Equivalence in Applications - UTN #5
667
2a204b45
JH
668=back
669
670=cut