cpan/Unicode-Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {
         # Compile-time sanity check: pack('U', 0x41) must produce the
         # one-character string "A". If it does not, loading the module is
         # aborted right here.
         # NOTE(review): presumably this guards against perls/platforms whose
         # native character set does not map U+0041 to "A" -- confirm.
   4     unless ("A" eq pack('U', 0x41)) {
             # The trailing "\n" keeps perl from appending "at FILE line N".
   5         die "Unicode::Normalize cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13
  14 no warnings 'utf8';
  15
  16 our $VERSION = '1.06';
     # Package name; used as the prefix of the croak() messages raised by
     # normalize(), normalize_partial() and check().
  17 our $PACKAGE = __PACKAGE__;
  18
     # Names exported by default (the four UAX #15 normalization forms).
  19 our @EXPORT = qw( NFC NFD NFKC NFKD );
     # Names exported only on request.
  20 our @EXPORT_OK = qw(
  21     normalize decompose reorder compose
  22     checkNFD checkNFKD checkNFC checkNFKC check
  23     getCanon getCompat getComposite getCombinClass
  24     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  25     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  26     FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
  27     normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
  28 );
     # Export tag groups: 'all' is every exportable name, 'normalize' is the
     # default exports plus the low-level normalization steps, 'check' is the
     # quick-check functions, and 'fast' is the FCD/FCC related functions.
  29 our %EXPORT_TAGS = (
  30     all       => [ @EXPORT, @EXPORT_OK ],
  31     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  32     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  33     fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  34 );
  35
  36 ##
  37 ## utilities for tests
  38 ##
  39
  40 sub pack_U {
         # Test helper: builds one string from the list of code points passed
         # in @_ (pack template 'U*', one character per code point) and
         # returns it.
  41     return pack('U*', @_);
  42 }
  43
  44 sub unpack_U {
         # Test helper: takes a string as its first argument and returns the
         # list of values obtained by unpacking it with template 'U*'
         # (the inverse of pack_U above).
         # An empty pack('U*') result is appended to the argument before
         # unpacking.
         # NOTE(review): presumably the concatenation forces the operand into
         # perl's internal Unicode representation so that 'U*' reads whole
         # characters rather than bytes -- confirm against perlfunc "pack".
  45     return unpack('U*', shift(@_).pack('U*'));
  46 }
  47
  48 require Exporter;
  49
  50 ##### The above part is common to XS and PP #####
  51
     # Inherit from Exporter (for import of the names listed in @EXPORT,
     # @EXPORT_OK and %EXPORT_TAGS) and from DynaLoader (for bootstrap).
  52 our @ISA = qw(Exporter DynaLoader);
  53 require DynaLoader;
     # Load the compiled part of the module, passing $VERSION to bootstrap.
     # NOTE(review): functions used below but not defined in this Perl code
     # (NFC, NFD, NFKC, NFKD, FCC, the check* functions, splitOnLastStarter)
     # are presumably provided by the XS code loaded here -- confirm.
  54 bootstrap Unicode::Normalize $VERSION;
  55
  56 ##### The below part is common to XS and PP #####
  57
  58 ##
  59 ## normalize
  60 ##
  61
  62 sub FCD ($) {
         # FCD($str)
         # Returns $str itself when checkFCD($str) reports true; otherwise
         # returns NFD($str).
         # The ($) prototype makes the sub take exactly one scalar argument.
  63     my $str = shift;
  64     return checkFCD($str) ? $str : NFD($str);
  65 }
  66
  67 our %formNorm = (
         # Dispatch table used by normalize() and normalize_partial():
         # form name => reference to the function producing that form.
         # NFC/NFD/NFKC/NFKD are each registered under a long name and a
         # short name ('NFC' and 'C', ...); FCD and FCC have a single name.
  68     NFC  => \&NFC,      C  => \&NFC,
  69     NFD  => \&NFD,      D  => \&NFD,
  70     NFKC => \&NFKC,     KC => \&NFKC,
  71     NFKD => \&NFKD,     KD => \&NFKD,
  72     FCD  => \&FCD,      FCC => \&FCC,
  73 );
  74
  75 sub normalize($$)
  76 {
         # normalize($form, $str)
         #   $form - form name; must be a key of %formNorm
         #           ('NFC', 'C', 'NFD', 'D', 'NFKC', 'KC', 'NFKD', 'KD',
         #           'FCD' or 'FCC')
         #   $str  - string passed to the selected function
         # Returns the result of calling the function registered under $form
         # with $str.
         # Croaks with an "invalid form name" message when $form is not a key
         # of %formNorm.
  77     my $form = shift;
  78     my $str = shift;
  79     if (exists $formNorm{$form}) {
  80         return $formNorm{$form}->($str);
  81     }
  82     croak($PACKAGE."::normalize: invalid form name: $form");
  83 }
  84
  85 ##
  86 ## partial
  87 ##
  88
  89 sub normalize_partial ($$) {
         # normalize_partial($form, $unprocessed)
         #   $_[0] - form name; must be a key of %formNorm
         #   $_[1] - string to process; OVERWRITTEN in the caller (see below)
         # Normalizes $_[1] into the requested form, splits the result with
         # splitOnLastStarter() and returns the first part. The second part
         # is stored back into the caller's variable.
         # Croaks with an "invalid form name" message when $_[0] is not a key
         # of %formNorm.
         # The arguments are deliberately accessed through @_ instead of being
         # copied into lexicals, because the write-back needs the alias.
  90     if (exists $formNorm{$_[0]}) {
  91         my $n = normalize($_[0], $_[1]);
  92         my($p, $u) = splitOnLastStarter($n);
             # Elements of @_ alias the caller's arguments, so this assignment
             # replaces the caller's string with the unprocessed remainder.
  93         $_[1] = $u;
  94         return $p;
  95     }
  96     croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
  97 }
  98
  99 sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
     # NFD_partial (above) and the three subs below are fixed-form shortcuts
     # for normalize_partial(). Each forwards $_[0] itself rather than a copy,
     # so the caller's variable is overwritten with the unprocessed remainder
     # and the processed part is returned.
 100 sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
 101 sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
 102 sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
 103
 104 ##
 105 ## check
 106 ##
 107
 108 our %formCheck = (
         # Dispatch table used by check(): form name => reference to the
         # corresponding quick-check function. The key set is the same as
         # that of %formNorm (long and short names for the NF* forms, a
         # single name for FCD and FCC).
 109     NFC  => \&checkNFC,         C  => \&checkNFC,
 110     NFD  => \&checkNFD,         D  => \&checkNFD,
 111     NFKC => \&checkNFKC,        KC => \&checkNFKC,
 112     NFKD => \&checkNFKD,        KD => \&checkNFKD,
 113     FCD  => \&checkFCD,         FCC => \&checkFCC,
 114 );
 115
 116 sub check($$)
 117 {
         # check($form, $str)
         #   $form - form name; must be a key of %formCheck
         #   $str  - string passed to the selected check function
         # Returns the result of calling the check function registered under
         # $form with $str (the POD below describes it as true for YES, false
         # for NO, and undef for MAYBE).
         # Croaks with an "invalid form name" message when $form is not a key
         # of %formCheck.
 118     my $form = shift;
 119     my $str = shift;
 120     if (exists $formCheck{$form}) {
 121         return $formCheck{$form}->($str);
 122     }
 123     croak($PACKAGE."::check: invalid form name: $form");
 124 }
 125
 126 1;
 127 __END__
 128
 129 =head1 NAME
 130
 131 Unicode::Normalize - Unicode Normalization Forms
 132
 133 =head1 SYNOPSIS
 134
 135 (1) using function names exported by default:
 136
 137   use Unicode::Normalize;
 138
 139   $NFD_string  = NFD($string);  # Normalization Form D
 140   $NFC_string  = NFC($string);  # Normalization Form C
 141   $NFKD_string = NFKD($string); # Normalization Form KD
 142   $NFKC_string = NFKC($string); # Normalization Form KC
 143
 144 (2) using function names exported on request:
 145
 146   use Unicode::Normalize 'normalize';
 147
 148   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 149   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 150   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 151   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 152
 153 =head1 DESCRIPTION
 154
 155 Parameters:
 156
 157 C<$string> is used as a string under character semantics (see F<perlunicode>).
 158
 159 C<$code_point> should be an unsigned integer representing a Unicode code point.
 160
 161 Note: The XSUB and pure Perl implementations are incompatible
 162 in how they interpret C<$code_point> as a decimal number.
 163 XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
 164 Do not use a floating-point value or a negative sign in C<$code_point>.
 165
 166 =head2 Normalization Forms
 167
 168 =over 4
 169
 170 =item C<$NFD_string = NFD($string)>
 171
 172 It returns the Normalization Form D (formed by canonical decomposition).
 173
 174 =item C<$NFC_string = NFC($string)>
 175
 176 It returns the Normalization Form C (formed by canonical decomposition
 177 followed by canonical composition).
 178
 179 =item C<$NFKD_string = NFKD($string)>
 180
 181 It returns the Normalization Form KD (formed by compatibility decomposition).
 182
 183 =item C<$NFKC_string = NFKC($string)>
 184
 185 It returns the Normalization Form KC (formed by compatibility decomposition
 186 followed by B<canonical> composition).
 187
 188 =item C<$FCD_string = FCD($string)>
 189
 190 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 191 it returns the string without modification; otherwise it returns an FCD string.
 192
 193 Note: FCD is not always unique, so several distinct FCD strings may be
 194 equivalent to each other. C<FCD()> will return one of these equivalent forms.
 195
 196 =item C<$FCC_string = FCC($string)>
 197
 198 It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 199
 200 Note: FCC is unique, as are the four normalization forms (NF*).
 201
 202 =item C<$normalized_string = normalize($form_name, $string)>
 203
 204 It returns the normalization form of C<$form_name>.
 205
 206 As C<$form_name>, one of the following names must be given.
 207
 208   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 209   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 210   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 211   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 212
 213   'FCD'          for "Fast C or D" Form  (UTN #5)
 214   'FCC'          for "Fast C Contiguous" (UTN #5)
 215
 216 =back
 217
 218 =head2 Decomposition and Composition
 219
 220 =over 4
 221
 222 =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
 223
 224 It returns the concatenation of the decomposition of each character
 225 in the string.
 226
 227 If the second parameter (a boolean) is omitted or false,
 228 the decomposition is canonical decomposition;
 229 if the second parameter (a boolean) is true,
 230 the decomposition is compatibility decomposition.
 231
 232 The string returned is not always in NFD/NFKD. Reordering may be required.
 233
 234     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 235     $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 236
 237 =item C<$reordered_string = reorder($string)>
 238
 239 It returns the result of reordering the combining characters
 240 according to Canonical Ordering Behavior.
 241
 242 For example, when you have a list of NFD/NFKD strings,
 243 you can get the concatenated NFD/NFKD string from them, by saying
 244
 245     $concat_NFD  = reorder(join '', @NFD_strings);
 246     $concat_NFKD = reorder(join '', @NFKD_strings);
 247
 248 =item C<$composed_string = compose($string)>
 249
 250 It returns the result of canonical composition
 251 without applying any decomposition.
 252
 253 For example, when you have a NFD/NFKD string,
 254 you can get its NFC/NFKC string, by saying
 255
 256     $NFC_string  = compose($NFD_string);
 257     $NFKC_string = compose($NFKD_string);
 258
 259 =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
 260
 261 It returns two strings: the first one, C<$processed>, is the part
 262 before the last starter, and the second one, C<$unprocessed>, is
 263 the remaining part that follows it. A starter is a character having
 264 a combining class of zero (see UAX #15).
 265
 266 Note that C<$processed> may be empty (when C<$normalized> contains no
 267 starter or starts with the last starter), and then C<$unprocessed>
 268 should be equal to the entire C<$normalized>.
 269
 270 When you have a C<$normalized> string and an C<$unnormalized> string
 271 following it, a simple concatenation is wrong:
 272
 273     $concat = $normalized . normalize($form, $unnormalized); # wrong!
 274
 275 Instead, do it like this:
 276
 277     ($processed, $unprocessed) = splitOnLastStarter($normalized);
 278      $concat = $processed . normalize($form, $unprocessed.$unnormalized);
 279
 280 C<splitOnLastStarter()> should be called with a pre-normalized parameter
 281 C<$normalized> that is already in the same form as the C<$form> you want.
 282
 283 If you have an array C<@string> whose elements should be concatenated and
 284 then normalized, you can do it like this:
 285
 286     my $result = "";
 287     my $unproc = "";
 288     foreach my $str (@string) {
 289         $unproc .= $str;
 290         my $n = normalize($form, $unproc);
 291         my($p, $u) = splitOnLastStarter($n);
 292         $result .= $p;
 293         $unproc  = $u;
 294     }
 295     $result .= $unproc;
 296     # instead of normalize($form, join('', @string))
 297
 298 =item C<$processed = normalize_partial($form, $unprocessed)>
 299
 300 A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
 301 Note that C<$unprocessed> will be modified as a side-effect.
 302
 303 If you have an array C<@string> whose elements should be concatenated and
 304 then normalized, you can do it like this:
 305
 306     my $result = "";
 307     my $unproc = "";
 308     foreach my $str (@string) {
 309         $unproc .= $str;
 310         $result .= normalize_partial($form, $unproc);
 311     }
 312     $result .= $unproc;
 313     # instead of normalize($form, join('', @string))
 314
 315 =item C<$processed = NFD_partial($unprocessed)>
 316
 317 It behaves like C<normalize_partial('NFD', $unprocessed)>.
 318 Note that C<$unprocessed> will be modified as a side-effect.
 319
 320 =item C<$processed = NFC_partial($unprocessed)>
 321
 322 It behaves like C<normalize_partial('NFC', $unprocessed)>.
 323 Note that C<$unprocessed> will be modified as a side-effect.
 324
 325 =item C<$processed = NFKD_partial($unprocessed)>
 326
 327 It behaves like C<normalize_partial('NFKD', $unprocessed)>.
 328 Note that C<$unprocessed> will be modified as a side-effect.
 329
 330 =item C<$processed = NFKC_partial($unprocessed)>
 331
 332 It behaves like C<normalize_partial('NFKC', $unprocessed)>.
 333 Note that C<$unprocessed> will be modified as a side-effect.
 334
 335 =back
 336
 337 =head2 Quick Check
 338
 339 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
 340
 341 The following functions check whether the string is in that normalization form.
 342
 343 The result returned will be one of the following:
 344
 345     YES     The string is in that normalization form.
 346     NO      The string is not in that normalization form.
 347     MAYBE   Dubious. Maybe yes, maybe no.
 348
 349 =over 4
 350
 351 =item C<$result = checkNFD($string)>
 352
 353 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 354
 355 =item C<$result = checkNFC($string)>
 356
 357 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 358 C<undef> if C<MAYBE>.
 359
 360 =item C<$result = checkNFKD($string)>
 361
 362 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 363
 364 =item C<$result = checkNFKC($string)>
 365
 366 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 367 C<undef> if C<MAYBE>.
 368
 369 =item C<$result = checkFCD($string)>
 370
 371 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 372
 373 =item C<$result = checkFCC($string)>
 374
 375 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 376 C<undef> if C<MAYBE>.
 377
 378 Note: If a string is not in FCD, it cannot be in FCC.
 379 So C<checkFCC($not_FCD_string)> should return C<NO>.
 380
 381 =item C<$result = check($form_name, $string)>
 382
 383 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 384 C<undef> if C<MAYBE>.
 385
 386 As C<$form_name>, one of the following names must be given.
 387
 388   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 389   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 390   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 391   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 392
 393   'FCD'          for "Fast C or D" Form  (UTN #5)
 394   'FCC'          for "Fast C Contiguous" (UTN #5)
 395
 396 =back
 397
 398 B<Note>
 399
 400 In the cases of NFD, NFKD, and FCD, the answer must be
 401 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 402 in the cases of NFC, NFKC, and FCC.
 403
 404 A C<MAYBE> string should contain at least one combining character
 405 or the like. For example, C<COMBINING ACUTE ACCENT> has
 406 the MAYBE_NFC/MAYBE_NFKC property.
 407
 408 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 409 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 410 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 411 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 412 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 413
 414 If you want to check exactly, compare the string with its NFC/NFKC/FCC.
 415
 416     if ($string eq NFC($string)) {
 417         # $string is exactly normalized in NFC;
 418     } else {
 419         # $string is not normalized in NFC;
 420     }
 421
 422     if ($string eq NFKC($string)) {
 423         # $string is exactly normalized in NFKC;
 424     } else {
 425         # $string is not normalized in NFKC;
 426     }
 427
 428 =head2 Character Data
 429
 430 These functions are an interface to the character data used internally.
 431 If you only want to get Unicode normalization forms, you don't need
 432 to call them yourself.
 433
 434 =over 4
 435
 436 =item C<$canonical_decomposition = getCanon($code_point)>
 437
 438 If the character is canonically decomposable (including Hangul Syllables),
 439 it returns the (full) canonical decomposition as a string.
 440 Otherwise it returns C<undef>.
 441
 442 B<Note:> According to the Unicode standard, the canonical decomposition
 443 of a character that is not canonically decomposable is the same as
 444 the character itself.
 445
 446 =item C<$compatibility_decomposition = getCompat($code_point)>
 447
 448 If the character is compatibility decomposable (including Hangul Syllables),
 449 it returns the (full) compatibility decomposition as a string.
 450 Otherwise it returns C<undef>.
 451
 452 B<Note:> According to the Unicode standard, the compatibility decomposition
 453 of a character that is not compatibility decomposable is the same as
 454 the character itself.
 455
 456 =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
 457
 458 If two characters here and next (as code points) are composable
 459 (including Hangul Jamo/Syllables and Composition Exclusions),
 460 it returns the code point of the composite.
 461
 462 If they are not composable, it returns C<undef>.
 463
 464 =item C<$combining_class = getCombinClass($code_point)>
 465
 466 It returns the combining class (as an integer) of the character.
 467
 468 =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
 469
 470 It returns a boolean indicating whether the character at the specified
 471 code point may be composed with the previous one in some composition
 472 (including Hangul Compositions, but excluding
 473 Composition Exclusions and Non-Starter Decompositions).
 474
 475 =item C<$is_exclusion = isExclusion($code_point)>
 476
 477 It returns a boolean indicating whether the code point is a composition exclusion.
 478
 479 =item C<$is_singleton = isSingleton($code_point)>
 480
 481 It returns a boolean indicating whether the code point is a singleton.
 482
 483 =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
 484
 485 It returns a boolean indicating whether the code point has a Non-Starter Decomposition.
 486
 487 =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
 488
 489 It returns a boolean of the derived property Comp_Ex
 490 (Full_Composition_Exclusion). This property is generated from
 491 Composition Exclusions + Singletons + Non-Starter Decompositions.
 492
 493 =item C<$NFD_is_NO = isNFD_NO($code_point)>
 494
 495 It returns a boolean of the derived property NFD_NO
 496 (NFD_Quick_Check=No).
 497
 498 =item C<$NFC_is_NO = isNFC_NO($code_point)>
 499
 500 It returns a boolean of the derived property NFC_NO
 501 (NFC_Quick_Check=No).
 502
 503 =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
 504
 505 It returns a boolean of the derived property NFC_MAYBE
 506 (NFC_Quick_Check=Maybe).
 507
 508 =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
 509
 510 It returns a boolean of the derived property NFKD_NO
 511 (NFKD_Quick_Check=No).
 512
 513 =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
 514
 515 It returns a boolean of the derived property NFKC_NO
 516 (NFKC_Quick_Check=No).
 517
 518 =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
 519
 520 It returns a boolean of the derived property NFKC_MAYBE
 521 (NFKC_Quick_Check=Maybe).
 522
 523 =back
 524
 525 =head1 EXPORT
 526
 527 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 528
 529 C<normalize> and some other functions: on request.
 530
 531 =head1 CAVEATS
 532
 533 =over 4
 534
 535 =item Perl's version vs. Unicode version
 536
 537 Since this module refers to perl core's Unicode database in the directory
 538 F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
 539 normalization implemented by this module depends on your perl's version.
 540
 541     perl's version     implemented Unicode version
 542        5.6.1              3.0.1
 543        5.7.2              3.1.0
 544        5.7.3              3.1.1 (normalization is same as 3.1.0)
 545        5.8.0              3.2.0
 546      5.8.1-5.8.3          4.0.0
 547      5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
 548      5.8.7-5.8.8          4.1.0
 549        5.10.0             5.0.0
 550      5.8.9, 5.10.1        5.1.0
 551
 552 =item Correction of decomposition mapping
 553
 554 In older Unicode versions, a small number of characters (all of which are
 555 CJK compatibility ideographs as far as they have been found) may have
 556 an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
 557 Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
 558 nor provide any specific version of normalization. Therefore this module
 559 running on an older perl with an older Unicode database may use
 560 the erroneous decomposition mapping blindly conforming to the Unicode database.
 561
 562 =item Revised definition of canonical composition
 563
 564 In Unicode 4.1.0, the definition D2 of canonical composition (which
 565 affects NFC and NFKC) has been changed (see Public Review Issue #29
 566 and recent UAX #15). This module has used the newer definition
 567 since version 0.07 (Oct 31, 2001).
 568 This module will not support the normalization according to the older
 569 definition, even if the Unicode version implemented by perl is
 570 lower than 4.1.0.
 571
 572 =back
 573
 574 =head1 AUTHOR
 575
 576 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
 577
 578 Copyright(C) 2001-2010, SADAHIRO Tomoyuki. Japan. All rights reserved.
 579
 580 This module is free software; you can redistribute it
 581 and/or modify it under the same terms as Perl itself.
 582
 583 =head1 SEE ALSO
 584
 585 =over 4
 586
 587 =item http://www.unicode.org/reports/tr15/
 588
 589 Unicode Normalization Forms - UAX #15
 590
 591 =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
 592
 593 Composition Exclusion Table
 594
 595 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 596
 597 Derived Normalization Properties
 598
 599 =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
 600
 601 Normalization Corrections
 602
 603 =item http://www.unicode.org/review/pr-29.html
 604
 605 Public Review Issue #29: Normalization Issue
 606
 607 =item http://www.unicode.org/notes/tn5/
 608
 609 Canonical Equivalence in Applications - UTN #5
 610
 611 =back
 612
 613 =cut