dist/Unicode-Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 use 5.006;
   4 use strict;
   5 use warnings;
   6 use Carp;
   7
   8 no warnings 'utf8';
   9
  10 our $VERSION = '1.32';
  11 our $PACKAGE = __PACKAGE__;
  12
  13 our @EXPORT = qw( NFC NFD NFKC NFKD );
  14 our @EXPORT_OK = qw(
  15     normalize decompose reorder compose
  16     checkNFD checkNFKD checkNFC checkNFKC check
  17     getCanon getCompat getComposite getCombinClass
  18     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  19     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  20     FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
  21     normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
  22 );
  23 our %EXPORT_TAGS = (
  24     all       => [ @EXPORT, @EXPORT_OK ],
  25     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  26     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  27     fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  28 );
  29
  30 ##
  31 ## utilities for tests
  32 ##
  33
  34                              # No EBCDIC support on early perls
  35 *to_native = ($::IS_ASCII || $] < 5.008)
  36              ? sub { return shift }
  37              : sub { utf8::unicode_to_native(shift) };
  38
  39 *from_native = ($::IS_ASCII || $] < 5.008)
  40              ? sub { return shift }
  41              : sub { utf8::native_to_unicode(shift) };
  42
  43 # The .t files are all in terms of Unicode, so xlate to/from native
  44 sub dot_t_pack_U {
  45     return pack('U*', map { to_native($_) } @_);
  46 }
  47
  48 sub dot_t_unpack_U {
  49
  50     # The empty pack returns an empty UTF-8 string, so the effect is to force
  51     # the shifted parameter into being UTF-8.  This allows this to work on
  52     # Perl 5.6, where there is no utf8::upgrade().
  53     return map { from_native($_) } unpack('U*', shift(@_).pack('U*'));
  54 }
  55
  56 sub get_printable_string ($) {
  57     use bytes;
  58     my $s = shift;
  59
  60     # DeMorgan's laws cause this to mean ascii printables
  61     return $s if $s =~ /[^[:^ascii:][:^print:]]/;
  62
  63     return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s;
  64 }
  65
  66 sub ok ($$;$) {
  67     my $count_ref = shift;  # Test number in caller
  68     my $p = my $r = shift;
  69     my $x;
  70     if (@_) {
  71         $x = shift;
  72         $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x;
  73     }
  74
  75     print $p ? "ok" : "not ok", ' ', ++$$count_ref, "\n";
  76
  77     return if $p;
  78
  79     my (undef, $file, $line) = caller(1);
  80     print STDERR "# Failed test $$count_ref at $file line $line\n";
  81
  82     return unless defined $x;
  83
  84     print STDERR "#      got ", get_printable_string($r), "\n";
  85     print STDERR "# expected ", get_printable_string($x), "\n";
  86 }
  87
  88 require Exporter;
  89
  90 ##### The above part is common to XS and PP #####
  91
  92 our @ISA = qw(Exporter);
  93 use XSLoader ();
  94 XSLoader::load( 'Unicode::Normalize', $VERSION );
  95
  96 ##### The below part is common to XS and PP #####
  97
  98 ##
  99 ## normalize
 100 ##
 101
 102 sub FCD ($) {
 103     my $str = shift;
 104     return checkFCD($str) ? $str : NFD($str);
 105 }
 106
 107 our %formNorm = (
 108     NFC  => \&NFC,      C  => \&NFC,
 109     NFD  => \&NFD,      D  => \&NFD,
 110     NFKC => \&NFKC,     KC => \&NFKC,
 111     NFKD => \&NFKD,     KD => \&NFKD,
 112     FCD  => \&FCD,      FCC => \&FCC,
 113 );
 114
 115 sub normalize($$)
 116 {
 117     my $form = shift;
 118     my $str = shift;
 119     if (exists $formNorm{$form}) {
 120         return $formNorm{$form}->($str);
 121     }
 122     croak($PACKAGE."::normalize: invalid form name: $form");
 123 }
 124
 125 ##
 126 ## partial
 127 ##
 128
 129 sub normalize_partial ($$) {
 130     if (exists $formNorm{$_[0]}) {
 131         my $n = normalize($_[0], $_[1]);
 132         my($p, $u) = splitOnLastStarter($n);
 133         $_[1] = $u;
 134         return $p;
 135     }
 136     croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
 137 }
 138
 139 sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
 140 sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
 141 sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
 142 sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
 143
 144 ##
 145 ## check
 146 ##
 147
 148 our %formCheck = (
 149     NFC  => \&checkNFC,         C  => \&checkNFC,
 150     NFD  => \&checkNFD,         D  => \&checkNFD,
 151     NFKC => \&checkNFKC,        KC => \&checkNFKC,
 152     NFKD => \&checkNFKD,        KD => \&checkNFKD,
 153     FCD  => \&checkFCD,         FCC => \&checkFCC,
 154 );
 155
 156 sub check($$)
 157 {
 158     my $form = shift;
 159     my $str = shift;
 160     if (exists $formCheck{$form}) {
 161         return $formCheck{$form}->($str);
 162     }
 163     croak($PACKAGE."::check: invalid form name: $form");
 164 }
 165
 166 1;
 167 __END__
 168
 169 =head1 NAME
 170
 171 Unicode::Normalize - Unicode Normalization Forms
 172
 173 =head1 SYNOPSIS
 174
 175 (1) using function names exported by default:
 176
 177   use Unicode::Normalize;
 178
 179   $NFD_string  = NFD($string);  # Normalization Form D
 180   $NFC_string  = NFC($string);  # Normalization Form C
 181   $NFKD_string = NFKD($string); # Normalization Form KD
 182   $NFKC_string = NFKC($string); # Normalization Form KC
 183
 184 (2) using function names exported on request:
 185
 186   use Unicode::Normalize 'normalize';
 187
 188   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 189   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 190   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 191   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 192
 193 =head1 DESCRIPTION
 194
 195 Parameters:
 196
 197 C<$string> is used as a string under character semantics (see L<perlunicode>).
 198
 199 C<$code_point> should be an unsigned integer representing a Unicode code point.
 200
 201 Note: Between XSUB and pure Perl, there is an incompatibility
 202 about the interpretation of C<$code_point> as a decimal number.
 203 XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
 204 Do not use a floating point nor a negative sign in C<$code_point>.
 205
 206 =head2 Normalization Forms
 207
 208 =over 4
 209
 210 =item C<$NFD_string = NFD($string)>
 211
 212 It returns the Normalization Form D (formed by canonical decomposition).
 213
 214 =item C<$NFC_string = NFC($string)>
 215
 216 It returns the Normalization Form C (formed by canonical decomposition
 217 followed by canonical composition).
 218
 219 =item C<$NFKD_string = NFKD($string)>
 220
 221 It returns the Normalization Form KD (formed by compatibility decomposition).
 222
 223 =item C<$NFKC_string = NFKC($string)>
 224
 225 It returns the Normalization Form KC (formed by compatibility decomposition
 226 followed by B<canonical> composition).
 227
 228 =item C<$FCD_string = FCD($string)>
 229
 230 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 231 it returns the string without modification; otherwise it returns an FCD string.
 232
 233 Note: FCD is not always unique, then plural forms may be equivalent
 234 each other. C<FCD()> will return one of these equivalent forms.
 235
 236 =item C<$FCC_string = FCC($string)>
 237
 238 It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 239
 240 Note: FCC is unique, as well as four normalization forms (NF*).
 241
 242 =item C<$normalized_string = normalize($form_name, $string)>
 243
 244 It returns the normalization form of C<$form_name>.
 245
 246 As C<$form_name>, one of the following names must be given.
 247
 248   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 249   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 250   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 251   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 252
 253   'FCD'          for "Fast C or D" Form  (UTN #5)
 254   'FCC'          for "Fast C Contiguous" (UTN #5)
 255
 256 =back
 257
 258 =head2 Decomposition and Composition
 259
 260 =over 4
 261
 262 =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
 263
 264 It returns the concatenation of the decomposition of each character
 265 in the string.
 266
 267 If the second parameter (a boolean) is omitted or false,
 268 the decomposition is canonical decomposition;
 269 if the second parameter (a boolean) is true,
 270 the decomposition is compatibility decomposition.
 271
 272 The string returned is not always in NFD/NFKD. Reordering may be required.
 273
 274  $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 275  $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 276
 277 =item C<$reordered_string = reorder($string)>
 278
 279 It returns the result of reordering the combining characters
 280 according to Canonical Ordering Behavior.
 281
 282 For example, when you have a list of NFD/NFKD strings,
 283 you can get the concatenated NFD/NFKD string from them, by saying
 284
 285     $concat_NFD  = reorder(join '', @NFD_strings);
 286     $concat_NFKD = reorder(join '', @NFKD_strings);
 287
 288 =item C<$composed_string = compose($string)>
 289
 290 It returns the result of canonical composition
 291 without applying any decomposition.
 292
 293 For example, when you have a NFD/NFKD string,
 294 you can get its NFC/NFKC string, by saying
 295
 296     $NFC_string  = compose($NFD_string);
 297     $NFKC_string = compose($NFKD_string);
 298
 299 =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
 300
 301 It returns two strings: the first one, C<$processed>, is a part
 302 before the last starter, and the second one, C<$unprocessed> is
 303 another part after the first part. A starter is a character having
 304 a combining class of zero (see UAX #15).
 305
 306 Note that C<$processed> may be empty (when C<$normalized> contains no
 307 starter or starts with the last starter), and then C<$unprocessed>
 308 should be equal to the entire C<$normalized>.
 309
 310 When you have a C<$normalized> string and an C<$unnormalized> string
 311 following it, a simple concatenation is wrong:
 312
 313  $concat = $normalized . normalize($form, $unnormalized); # wrong!
 314
 315 Instead of it, do like this:
 316
 317  ($processed, $unprocessed) = splitOnLastStarter($normalized);
 318  $concat = $processed . normalize($form,$unprocessed.$unnormalized);
 319
 320 C<splitOnLastStarter()> should be called with a pre-normalized parameter
 321 C<$normalized>, that is in the same form as C<$form> you want.
 322
 323 If you have an array of C<@string> that should be concatenated and then
 324 normalized, you can do like this:
 325
 326     my $result = "";
 327     my $unproc = "";
 328     foreach my $str (@string) {
 329         $unproc .= $str;
 330         my $n = normalize($form, $unproc);
 331         my($p, $u) = splitOnLastStarter($n);
 332         $result .= $p;
 333         $unproc  = $u;
 334     }
 335     $result .= $unproc;
 336     # instead of normalize($form, join('', @string))
 337
 338 =item C<$processed = normalize_partial($form, $unprocessed)>
 339
 340 A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
 341 Note that C<$unprocessed> will be modified as a side-effect.
 342
 343 If you have an array of C<@string> that should be concatenated and then
 344 normalized, you can do like this:
 345
 346     my $result = "";
 347     my $unproc = "";
 348     foreach my $str (@string) {
 349         $unproc .= $str;
 350         $result .= normalize_partial($form, $unproc);
 351     }
 352     $result .= $unproc;
 353     # instead of normalize($form, join('', @string))
 354
 355 =item C<$processed = NFD_partial($unprocessed)>
 356
 357 It does like C<normalize_partial('NFD', $unprocessed)>.
 358 Note that C<$unprocessed> will be modified as a side-effect.
 359
 360 =item C<$processed = NFC_partial($unprocessed)>
 361
 362 It does like C<normalize_partial('NFC', $unprocessed)>.
 363 Note that C<$unprocessed> will be modified as a side-effect.
 364
 365 =item C<$processed = NFKD_partial($unprocessed)>
 366
 367 It does like C<normalize_partial('NFKD', $unprocessed)>.
 368 Note that C<$unprocessed> will be modified as a side-effect.
 369
 370 =item C<$processed = NFKC_partial($unprocessed)>
 371
 372 It does like C<normalize_partial('NFKC', $unprocessed)>.
 373 Note that C<$unprocessed> will be modified as a side-effect.
 374
 375 =back
 376
 377 =head2 Quick Check
 378
 379 (see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>)
 380
 381 The following functions check whether the string is in that normalization form.
 382
 383 The result returned will be one of the following:
 384
 385     YES     The string is in that normalization form.
 386     NO      The string is not in that normalization form.
 387     MAYBE   Dubious. Maybe yes, maybe no.
 388
 389 =over 4
 390
 391 =item C<$result = checkNFD($string)>
 392
 393 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 394
 395 =item C<$result = checkNFC($string)>
 396
 397 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 398 C<undef> if C<MAYBE>.
 399
 400 =item C<$result = checkNFKD($string)>
 401
 402 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 403
 404 =item C<$result = checkNFKC($string)>
 405
 406 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 407 C<undef> if C<MAYBE>.
 408
 409 =item C<$result = checkFCD($string)>
 410
 411 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 412
 413 =item C<$result = checkFCC($string)>
 414
 415 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 416 C<undef> if C<MAYBE>.
 417
 418 Note: If a string is not in FCD, it must not be in FCC.
 419 So C<checkFCC($not_FCD_string)> should return C<NO>.
 420
 421 =item C<$result = check($form_name, $string)>
 422
 423 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 424 C<undef> if C<MAYBE>.
 425
 426 As C<$form_name>, one of the following names must be given.
 427
 428   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 429   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 430   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 431   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 432
 433   'FCD'          for "Fast C or D" Form  (UTN #5)
 434   'FCC'          for "Fast C Contiguous" (UTN #5)
 435
 436 =back
 437
 438 B<Note>
 439
 440 In the cases of NFD, NFKD, and FCD, the answer must be
 441 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 442 in the cases of NFC, NFKC, and FCC.
 443
 444 A C<MAYBE> string should contain at least one combining character
 445 or the like. For example, C<COMBINING ACUTE ACCENT> has
 446 the MAYBE_NFC/MAYBE_NFKC property.
 447
 448 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 449 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 450 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 451 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 452 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 453
 454 If you want to check exactly, compare the string with its NFC/NFKC/FCC.
 455
 456     if ($string eq NFC($string)) {
 457         # $string is exactly normalized in NFC;
 458     } else {
 459         # $string is not normalized in NFC;
 460     }
 461
 462     if ($string eq NFKC($string)) {
 463         # $string is exactly normalized in NFKC;
 464     } else {
 465         # $string is not normalized in NFKC;
 466     }
 467
 468 =head2 Character Data
 469
 470 These functions are interface of character data used internally.
 471 If you want only to get Unicode normalization forms, you don't need
 472 call them yourself.
 473
 474 =over 4
 475
 476 =item C<$canonical_decomposition = getCanon($code_point)>
 477
 478 If the character is canonically decomposable (including Hangul Syllables),
 479 it returns the (full) canonical decomposition as a string.
 480 Otherwise it returns C<undef>.
 481
 482 B<Note:> According to the Unicode standard, the canonical decomposition
 483 of the character that is not canonically decomposable is same as
 484 the character itself.
 485
 486 =item C<$compatibility_decomposition = getCompat($code_point)>
 487
 488 If the character is compatibility decomposable (including Hangul Syllables),
 489 it returns the (full) compatibility decomposition as a string.
 490 Otherwise it returns C<undef>.
 491
 492 B<Note:> According to the Unicode standard, the compatibility decomposition
 493 of the character that is not compatibility decomposable is same as
 494 the character itself.
 495
 496 =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
 497
 498 If two characters here and next (as code points) are composable
 499 (including Hangul Jamo/Syllables and Composition Exclusions),
 500 it returns the code point of the composite.
 501
 502 If they are not composable, it returns C<undef>.
 503
 504 =item C<$combining_class = getCombinClass($code_point)>
 505
 506 It returns the combining class (as an integer) of the character.
 507
 508 =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
 509
 510 It returns a boolean whether the character of the specified codepoint
 511 may be composed with the previous one in a certain composition
 512 (including Hangul Compositions, but excluding
 513 Composition Exclusions and Non-Starter Decompositions).
 514
 515 =item C<$is_exclusion = isExclusion($code_point)>
 516
 517 It returns a boolean whether the code point is a composition exclusion.
 518
 519 =item C<$is_singleton = isSingleton($code_point)>
 520
 521 It returns a boolean whether the code point is a singleton
 522
 523 =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
 524
 525 It returns a boolean whether the code point has Non-Starter Decomposition.
 526
 527 =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
 528
 529 It returns a boolean of the derived property Comp_Ex
 530 (Full_Composition_Exclusion). This property is generated from
 531 Composition Exclusions + Singletons + Non-Starter Decompositions.
 532
 533 =item C<$NFD_is_NO = isNFD_NO($code_point)>
 534
 535 It returns a boolean of the derived property NFD_NO
 536 (NFD_Quick_Check=No).
 537
 538 =item C<$NFC_is_NO = isNFC_NO($code_point)>
 539
 540 It returns a boolean of the derived property NFC_NO
 541 (NFC_Quick_Check=No).
 542
 543 =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
 544
 545 It returns a boolean of the derived property NFC_MAYBE
 546 (NFC_Quick_Check=Maybe).
 547
 548 =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
 549
 550 It returns a boolean of the derived property NFKD_NO
 551 (NFKD_Quick_Check=No).
 552
 553 =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
 554
 555 It returns a boolean of the derived property NFKC_NO
 556 (NFKC_Quick_Check=No).
 557
 558 =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
 559
 560 It returns a boolean of the derived property NFKC_MAYBE
 561 (NFKC_Quick_Check=Maybe).
 562
 563 =back
 564
 565 =head1 EXPORT
 566
 567 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 568
 569 C<normalize> and other some functions: on request.
 570
 571 =head1 CAVEATS
 572
 573 =over 4
 574
 575 =item Perl's version vs. Unicode version
 576
 577 Since this module refers to perl core's Unicode database in the directory
 578 F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
 579 normalization implemented by this module depends on what has been
 580 compiled into your perl.  The following table lists the default Unicode
 581 version that comes with various perl versions.  (It is possible to change
 582 the Unicode version in any perl version to be any earlier Unicode version,
 583 so one could cause Unicode 3.2 to be used in any perl version starting with
 584 5.8.0.  Read F<C<$Config{privlib}>/unicore/README.perl> for details.
 585
 586     perl's version     implemented Unicode version
 587        5.6.1              3.0.1
 588        5.7.2              3.1.0
 589        5.7.3              3.1.1 (normalization is same as 3.1.0)
 590        5.8.0              3.2.0
 591          5.8.1-5.8.3      4.0.0
 592          5.8.4-5.8.6      4.0.1 (normalization is same as 4.0.0)
 593          5.8.7-5.8.8      4.1.0
 594        5.10.0             5.0.0
 595         5.8.9, 5.10.1     5.1.0
 596        5.12.x             5.2.0
 597        5.14.x             6.0.0
 598        5.16.x             6.1.0
 599        5.18.x             6.2.0
 600        5.20.x             6.3.0
 601        5.22.x             7.0.0
 602
 603 =item Correction of decomposition mapping
 604
 605 In older Unicode versions, a small number of characters (all of which are
 606 CJK compatibility ideographs as far as they have been found) may have
 607 an erroneous decomposition mapping (see
 608 F<lib/unicore/NormalizationCorrections.txt>).
 609 Anyhow, this module will neither refer to
 610 F<lib/unicore/NormalizationCorrections.txt>
 611 nor provide any specific version of normalization. Therefore this module
 612 running on an older perl with an older Unicode database may use
 613 the erroneous decomposition mapping blindly conforming to the Unicode database.
 614
 615 =item Revised definition of canonical composition
 616
 617 In Unicode 4.1.0, the definition D2 of canonical composition (which
 618 affects NFC and NFKC) has been changed (see Public Review Issue #29
 619 and recent UAX #15). This module has used the newer definition
 620 since the version 0.07 (Oct 31, 2001).
 621 This module will not support the normalization according to the older
 622 definition, even if the Unicode version implemented by perl is
 623 lower than 4.1.0.
 624
 625 =back
 626
 627 =head1 AUTHOR
 628
 629 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
 630
 631 Currently maintained by <perl5-porters@perl.org>
 632
 633 Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
 634
 635 =head1 LICENSE
 636
 637 This module is free software; you can redistribute it
 638 and/or modify it under the same terms as Perl itself.
 639
 640 =head1 SEE ALSO
 641
 642 =over 4
 643
 644 =item L<http://www.unicode.org/reports/tr15/>
 645
 646 Unicode Normalization Forms - UAX #15
 647
 648 =item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt>
 649
 650 Composition Exclusion Table
 651
 652 =item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt>
 653
 654 Derived Normalization Properties
 655
 656 =item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt>
 657
 658 Normalization Corrections
 659
 660 =item L<http://www.unicode.org/review/pr-29.html>
 661
 662 Public Review Issue #29: Normalization Issue
 663
 664 =item L<http://www.unicode.org/notes/tn5/>
 665
 666 Canonical Equivalence in Applications - UTN #5
 667
 668 =back
 669
 670 =cut