cpan/Unicode-Normalize/Normalize.pm

   1 package Unicode::Normalize;
   2
   3 BEGIN {    # compile-time guard: pack/unpack 'U' must map 0x41 <-> 'A'
   4     # Packing code point 0x41 has to give the one-character string 'A' ...
   5     die "Unicode::Normalize cannot stringify a Unicode code point\n"
   6         if 'A' ne pack('U', 0x41);
   7     # ... and unpacking 'A' has to give the code point 0x41 back.
   8     die "Unicode::Normalize cannot get Unicode code point\n"
   9         if 0x41 != unpack('U', 'A');
  10 }
  11
  12 use 5.006;
  13 use strict;
  14 use warnings;
  15 use Carp;
  16
  17 no warnings 'utf8';
  18
  19 our $VERSION = '1.25';
  20 our $PACKAGE = __PACKAGE__;    # prefix of this module's croak() messages
  21 # Exporter lists: @EXPORT is exported by default, @EXPORT_OK only on request.
  22 our @EXPORT = qw( NFC NFD NFKC NFKD );
  23 our @EXPORT_OK = qw(
  24     normalize decompose reorder compose
  25     checkNFD checkNFKD checkNFC checkNFKC check
  26     getCanon getCompat getComposite getCombinClass
  27     isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
  28     isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
  29     FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
  30     normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
  31 );
  32 our %EXPORT_TAGS = (    # export tag name => list of function names
  33     all       => [ @EXPORT, @EXPORT_OK ],    # every exportable name
  34     normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
  35     check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  36     fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
  37 );
  38
  39 ##
  40 ## utilities for tests
  41 ##
  42 sub pack_U {    # (@code_points) -> string made of those code points
  43     my @code_points = @_;
  44     return pack('U*', @code_points);
  45 }
  46
  47 sub unpack_U {    # ($string) -> list of the code points of its characters
  48     my $string = shift;
  49     # An empty pack('U*') returns an empty UTF-8 string; appending it forces
  50     # $string into being UTF-8.  Doing it this way also works on Perl 5.6,
  51     # where there is no utf8::upgrade().
  52     return unpack('U*', $string . pack('U*'));
  53 }
  54
  55 require Exporter;
  56
  57 ##### The above part is common to XS and PP #####
  58
  59 our @ISA = qw(Exporter DynaLoader);    # Exporter: import(); DynaLoader: bootstrap()
  60 require DynaLoader;
  61 bootstrap Unicode::Normalize $VERSION;    # load this module's compiled (XS) code
  62
  63 ##### The below part is common to XS and PP #####
  64
  65 ##
  66 ## normalize
  67 ##
  68 sub FCD ($) {    # FCD($str): $str itself if checkFCD($str) is true, else NFD($str)
  69     my ($str) = @_;
  70     return $str if checkFCD($str);    # accepted by checkFCD(): return it unmodified
  71     return NFD($str);
  72 }
  73
  74 our %formNorm = (    # form name (long or short spelling) => normalizing function
  75     NFC  => \&NFC,      C  => \&NFC,
  76     NFD  => \&NFD,      D  => \&NFD,
  77     NFKC => \&NFKC,     KC => \&NFKC,
  78     NFKD => \&NFKD,     KD => \&NFKD,
  79     FCD  => \&FCD,      FCC => \&FCC,    # these two have no short spelling
  80 );
  81 # normalize($form, $str): returns $str normalized to $form; croaks on a bad $form.
  82 sub normalize($$)
  83 {
  84     my ($form, $str) = @_;
  85     # Only the names listed in %formNorm are valid form names.
  86     croak($PACKAGE."::normalize: invalid form name: $form")
  87         unless exists $formNorm{$form};
  88     # Call the normalizing function registered under that name.
  89     return $formNorm{$form}->($str);
  90 }
  91
  92 ##
  93 ## partial
  94 ##
  95 # Normalize $_[1] to form $_[0]; return the part before the last starter.
  96 sub normalize_partial ($$) {
  97     croak($PACKAGE."::normalize_partial: invalid form name: $_[0]")
  98         unless exists $formNorm{$_[0]};
  99     # Normalize the whole input, then split the result with splitOnLastStarter().
 100     my $normalized = normalize($_[0], $_[1]);
 101     my ($head, $tail) = splitOnLastStarter($normalized);
 102     $_[1] = $tail;    # @_ aliases the caller's arguments: hand the tail back
 103     return $head;
 104 }
 105 # Fixed-form wrappers; $_[0] itself is passed on, so the caller's variable is updated.
 106 sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
 107 sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
 108 sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
 109 sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
 110
 111 ##
 112 ## check
 113 ##
 114
 115 our %formCheck = (    # form name (long or short spelling) => checking function
 116     NFC  => \&checkNFC,         C  => \&checkNFC,
 117     NFD  => \&checkNFD,         D  => \&checkNFD,
 118     NFKC => \&checkNFKC,        KC => \&checkNFKC,
 119     NFKD => \&checkNFKD,        KD => \&checkNFKD,
 120     FCD  => \&checkFCD,         FCC => \&checkFCC,    # these two have no short spelling
 121 );
 122 # check($form, $str): returns the result of the checker for $form; croaks on a bad $form.
 123 sub check($$)
 124 {
 125     my ($form, $str) = @_;
 126     # Only the names listed in %formCheck are valid form names.
 127     croak($PACKAGE."::check: invalid form name: $form")
 128         unless exists $formCheck{$form};
 129     # Call the checking function registered under that name.
 130     return $formCheck{$form}->($str);
 131 }
 132
 133 1;
 134 __END__
 135
 136 =head1 NAME
 137
 138 Unicode::Normalize - Unicode Normalization Forms
 139
 140 =head1 SYNOPSIS
 141
 142 (1) using function names exported by default:
 143
 144   use Unicode::Normalize;
 145
 146   $NFD_string  = NFD($string);  # Normalization Form D
 147   $NFC_string  = NFC($string);  # Normalization Form C
 148   $NFKD_string = NFKD($string); # Normalization Form KD
 149   $NFKC_string = NFKC($string); # Normalization Form KC
 150
 151 (2) using function names exported on request:
 152
 153   use Unicode::Normalize 'normalize';
 154
 155   $NFD_string  = normalize('D',  $string);  # Normalization Form D
 156   $NFC_string  = normalize('C',  $string);  # Normalization Form C
 157   $NFKD_string = normalize('KD', $string);  # Normalization Form KD
 158   $NFKC_string = normalize('KC', $string);  # Normalization Form KC
 159
 160 =head1 DESCRIPTION
 161
 162 Parameters:
 163
 164 C<$string> is used as a string under character semantics (see F<perlunicode>).
 165
 166 C<$code_point> should be an unsigned integer representing a Unicode code point.
 167
 168 Note: Between XSUB and pure Perl, there is an incompatibility
 169 about the interpretation of C<$code_point> as a decimal number.
 170 XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
 171 Do not use a floating-point number or a negative sign in C<$code_point>.
 172
 173 =head2 Normalization Forms
 174
 175 =over 4
 176
 177 =item C<$NFD_string = NFD($string)>
 178
 179 It returns the Normalization Form D (formed by canonical decomposition).
 180
 181 =item C<$NFC_string = NFC($string)>
 182
 183 It returns the Normalization Form C (formed by canonical decomposition
 184 followed by canonical composition).
 185
 186 =item C<$NFKD_string = NFKD($string)>
 187
 188 It returns the Normalization Form KD (formed by compatibility decomposition).
 189
 190 =item C<$NFKC_string = NFKC($string)>
 191
 192 It returns the Normalization Form KC (formed by compatibility decomposition
 193 followed by B<canonical> composition).
 194
 195 =item C<$FCD_string = FCD($string)>
 196
 197 If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
 198 it returns the string without modification; otherwise it returns an FCD string.
 199
 200 Note: FCD is not always unique, so several different forms may be equivalent
 201 to each other. C<FCD()> will return one of these equivalent forms.
 202
 203 =item C<$FCC_string = FCC($string)>
 204
 205 It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
 206
 207 Note: FCC is unique, as are the four normalization forms (NF*).
 208
 209 =item C<$normalized_string = normalize($form_name, $string)>
 210
 211 It returns the normalization form of C<$form_name>.
 212
 213 As C<$form_name>, one of the following names must be given.
 214
 215   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 216   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 217   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 218   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 219
 220   'FCD'          for "Fast C or D" Form  (UTN #5)
 221   'FCC'          for "Fast C Contiguous" (UTN #5)
 222
 223 =back
 224
 225 =head2 Decomposition and Composition
 226
 227 =over 4
 228
 229 =item C<$decomposed_string = decompose($string [, $useCompatMapping])>
 230
 231 It returns the concatenation of the decomposition of each character
 232 in the string.
 233
 234 If the second parameter (a boolean) is omitted or false,
 235 the decomposition is canonical decomposition;
 236 if the second parameter (a boolean) is true,
 237 the decomposition is compatibility decomposition.
 238
 239 The string returned is not always in NFD/NFKD. Reordering may be required.
 240
 241     $NFD_string  = reorder(decompose($string));       # eq. to NFD()
 242     $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
 243
 244 =item C<$reordered_string = reorder($string)>
 245
 246 It returns the result of reordering the combining characters
 247 according to Canonical Ordering Behavior.
 248
 249 For example, when you have a list of NFD/NFKD strings,
 250 you can get the concatenated NFD/NFKD string from them, by saying
 251
 252     $concat_NFD  = reorder(join '', @NFD_strings);
 253     $concat_NFKD = reorder(join '', @NFKD_strings);
 254
 255 =item C<$composed_string = compose($string)>
 256
 257 It returns the result of canonical composition
 258 without applying any decomposition.
 259
 260 For example, when you have a NFD/NFKD string,
 261 you can get its NFC/NFKC string, by saying
 262
 263     $NFC_string  = compose($NFD_string);
 264     $NFKC_string = compose($NFKD_string);
 265
 266 =item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
 267
 268 It returns two strings: the first one, C<$processed>, is the part
 269 before the last starter, and the second one, C<$unprocessed>, is
 270 the remainder of the string. A starter is a character having
 271 a combining class of zero (see UAX #15).
 272
 273 Note that C<$processed> may be empty (when C<$normalized> contains no
 274 starter or starts with the last starter), and then C<$unprocessed>
 275 should be equal to the entire C<$normalized>.
 276
 277 When you have a C<$normalized> string and an C<$unnormalized> string
 278 following it, a simple concatenation is wrong:
 279
 280     $concat = $normalized . normalize($form, $unnormalized); # wrong!
 281
 282 Instead, do it like this:
 283
 284     ($processed, $unprocessed) = splitOnLastStarter($normalized);
 285      $concat = $processed . normalize($form, $unprocessed.$unnormalized);
 286
 287 C<splitOnLastStarter()> should be called with a pre-normalized parameter
 288 C<$normalized>, that is in the same form as C<$form> you want.
 289
 290 If you have an array of C<@string> that should be concatenated and then
 291 normalized, you can do it like this:
 292
 293     my $result = "";
 294     my $unproc = "";
 295     foreach my $str (@string) {
 296         $unproc .= $str;
 297         my $n = normalize($form, $unproc);
 298         my($p, $u) = splitOnLastStarter($n);
 299         $result .= $p;
 300         $unproc  = $u;
 301     }
 302     $result .= $unproc;
 303     # instead of normalize($form, join('', @string))
 304
 305 =item C<$processed = normalize_partial($form, $unprocessed)>
 306
 307 A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
 308 Note that C<$unprocessed> will be modified as a side-effect.
 309
 310 If you have an array of C<@string> that should be concatenated and then
 311 normalized, you can do it like this:
 312
 313     my $result = "";
 314     my $unproc = "";
 315     foreach my $str (@string) {
 316         $unproc .= $str;
 317         $result .= normalize_partial($form, $unproc);
 318     }
 319     $result .= $unproc;
 320     # instead of normalize($form, join('', @string))
 321
 322 =item C<$processed = NFD_partial($unprocessed)>
 323
 324 It works like C<normalize_partial('NFD', $unprocessed)>.
 325 Note that C<$unprocessed> will be modified as a side-effect.
 326
 327 =item C<$processed = NFC_partial($unprocessed)>
 328
 329 It works like C<normalize_partial('NFC', $unprocessed)>.
 330 Note that C<$unprocessed> will be modified as a side-effect.
 331
 332 =item C<$processed = NFKD_partial($unprocessed)>
 333
 334 It works like C<normalize_partial('NFKD', $unprocessed)>.
 335 Note that C<$unprocessed> will be modified as a side-effect.
 336
 337 =item C<$processed = NFKC_partial($unprocessed)>
 338
 339 It works like C<normalize_partial('NFKC', $unprocessed)>.
 340 Note that C<$unprocessed> will be modified as a side-effect.
 341
 342 =back
 343
 344 =head2 Quick Check
 345
 346 (see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
 347
 348 The following functions check whether the string is in that normalization form.
 349
 350 The result returned will be one of the following:
 351
 352     YES     The string is in that normalization form.
 353     NO      The string is not in that normalization form.
 354     MAYBE   Dubious. Maybe yes, maybe no.
 355
 356 =over 4
 357
 358 =item C<$result = checkNFD($string)>
 359
 360 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 361
 362 =item C<$result = checkNFC($string)>
 363
 364 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 365 C<undef> if C<MAYBE>.
 366
 367 =item C<$result = checkNFKD($string)>
 368
 369 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 370
 371 =item C<$result = checkNFKC($string)>
 372
 373 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 374 C<undef> if C<MAYBE>.
 375
 376 =item C<$result = checkFCD($string)>
 377
 378 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
 379
 380 =item C<$result = checkFCC($string)>
 381
 382 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 383 C<undef> if C<MAYBE>.
 384
 385 Note: If a string is not in FCD, it cannot be in FCC either.
 386 So C<checkFCC($not_FCD_string)> should return C<NO>.
 387
 388 =item C<$result = check($form_name, $string)>
 389
 390 It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
 391 C<undef> if C<MAYBE>.
 392
 393 As C<$form_name>, one of the following names must be given.
 394
 395   'C'  or 'NFC'  for Normalization Form C  (UAX #15)
 396   'D'  or 'NFD'  for Normalization Form D  (UAX #15)
 397   'KC' or 'NFKC' for Normalization Form KC (UAX #15)
 398   'KD' or 'NFKD' for Normalization Form KD (UAX #15)
 399
 400   'FCD'          for "Fast C or D" Form  (UTN #5)
 401   'FCC'          for "Fast C Contiguous" (UTN #5)
 402
 403 =back
 404
 405 B<Note>
 406
 407 In the cases of NFD, NFKD, and FCD, the answer must be
 408 either C<YES> or C<NO>. The answer C<MAYBE> may be returned
 409 in the cases of NFC, NFKC, and FCC.
 410
 411 A C<MAYBE> string should contain at least one combining character
 412 or the like. For example, C<COMBINING ACUTE ACCENT> has
 413 the MAYBE_NFC/MAYBE_NFKC property.
 414
 415 Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
 416 and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
 417 C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
 418 (its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
 419 while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
 420
 421 If you want to check exactly, compare the string with its NFC/NFKC/FCC.
 422
 423     if ($string eq NFC($string)) {
 424         # $string is exactly normalized in NFC;
 425     } else {
 426         # $string is not normalized in NFC;
 427     }
 428
 429     if ($string eq NFKC($string)) {
 430         # $string is exactly normalized in NFKC;
 431     } else {
 432         # $string is not normalized in NFKC;
 433     }
 434
 435 =head2 Character Data
 436
 437 These functions are an interface to the character data used internally.
 438 If you only want to get Unicode normalization forms, you don't need
 439 to call them yourself.
 440
 441 =over 4
 442
 443 =item C<$canonical_decomposition = getCanon($code_point)>
 444
 445 If the character is canonically decomposable (including Hangul Syllables),
 446 it returns the (full) canonical decomposition as a string.
 447 Otherwise it returns C<undef>.
 448
 449 B<Note:> According to the Unicode standard, the canonical decomposition
 450 of a character that is not canonically decomposable is the same as
 451 the character itself.
 452
 453 =item C<$compatibility_decomposition = getCompat($code_point)>
 454
 455 If the character is compatibility decomposable (including Hangul Syllables),
 456 it returns the (full) compatibility decomposition as a string.
 457 Otherwise it returns C<undef>.
 458
 459 B<Note:> According to the Unicode standard, the compatibility decomposition
 460 of a character that is not compatibility decomposable is the same as
 461 the character itself.
 462
 463 =item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
 464
 465 If two characters here and next (as code points) are composable
 466 (including Hangul Jamo/Syllables and Composition Exclusions),
 467 it returns the code point of the composite.
 468
 469 If they are not composable, it returns C<undef>.
 470
 471 =item C<$combining_class = getCombinClass($code_point)>
 472
 473 It returns the combining class (as an integer) of the character.
 474
 475 =item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
 476
 477 It returns a boolean whether the character of the specified codepoint
 478 may be composed with the previous one in a certain composition
 479 (including Hangul Compositions, but excluding
 480 Composition Exclusions and Non-Starter Decompositions).
 481
 482 =item C<$is_exclusion = isExclusion($code_point)>
 483
 484 It returns a boolean whether the code point is a composition exclusion.
 485
 486 =item C<$is_singleton = isSingleton($code_point)>
 487
 488 It returns a boolean whether the code point is a singleton.
 489
 490 =item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
 491
 492 It returns a boolean whether the code point has Non-Starter Decomposition.
 493
 494 =item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
 495
 496 It returns a boolean of the derived property Comp_Ex
 497 (Full_Composition_Exclusion). This property is generated from
 498 Composition Exclusions + Singletons + Non-Starter Decompositions.
 499
 500 =item C<$NFD_is_NO = isNFD_NO($code_point)>
 501
 502 It returns a boolean of the derived property NFD_NO
 503 (NFD_Quick_Check=No).
 504
 505 =item C<$NFC_is_NO = isNFC_NO($code_point)>
 506
 507 It returns a boolean of the derived property NFC_NO
 508 (NFC_Quick_Check=No).
 509
 510 =item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
 511
 512 It returns a boolean of the derived property NFC_MAYBE
 513 (NFC_Quick_Check=Maybe).
 514
 515 =item C<$NFKD_is_NO = isNFKD_NO($code_point)>
 516
 517 It returns a boolean of the derived property NFKD_NO
 518 (NFKD_Quick_Check=No).
 519
 520 =item C<$NFKC_is_NO = isNFKC_NO($code_point)>
 521
 522 It returns a boolean of the derived property NFKC_NO
 523 (NFKC_Quick_Check=No).
 524
 525 =item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
 526
 527 It returns a boolean of the derived property NFKC_MAYBE
 528 (NFKC_Quick_Check=Maybe).
 529
 530 =back
 531
 532 =head1 EXPORT
 533
 534 C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
 535
 536 C<normalize> and some other functions: on request.
 537
 538 =head1 CAVEATS
 539
 540 =over 4
 541
 542 =item Perl's version vs. Unicode version
 543
 544 Since this module refers to perl core's Unicode database in the directory
 545 F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
 546 normalization implemented by this module depends on what has been
 547 compiled into your perl.  The following table lists the default Unicode
 548 version that comes with various perl versions.  (It is possible to change
 549 the Unicode version in any perl version to be any earlier Unicode version,
 550 so one could cause Unicode 3.2 to be used in any perl version starting with
 551 5.8.0.  See C<$Config{privlib}>/F<unicore/README.perl>.)
 552
 553     perl's version     implemented Unicode version
 554        5.6.1              3.0.1
 555        5.7.2              3.1.0
 556        5.7.3              3.1.1 (normalization is same as 3.1.0)
 557        5.8.0              3.2.0
 558          5.8.1-5.8.3      4.0.0
 559          5.8.4-5.8.6      4.0.1 (normalization is same as 4.0.0)
 560          5.8.7-5.8.8      4.1.0
 561        5.10.0             5.0.0
 562         5.8.9, 5.10.1     5.1.0
 563        5.12.x             5.2.0
 564        5.14.x             6.0.0
 565        5.16.x             6.1.0
 566        5.18.x             6.2.0
 567        5.20.x             6.3.0
 568        5.22.x             7.0.0
 569
 570 =item Correction of decomposition mapping
 571
 572 In older Unicode versions, a small number of characters (all of which are
 573 CJK compatibility ideographs as far as they have been found) may have
 574 an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
 575 Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
 576 nor provide any specific version of normalization. Therefore this module
 577 running on an older perl with an older Unicode database may use
 578 the erroneous decomposition mapping blindly conforming to the Unicode database.
 579
 580 =item Revised definition of canonical composition
 581
 582 In Unicode 4.1.0, the definition D2 of canonical composition (which
 583 affects NFC and NFKC) has been changed (see Public Review Issue #29
 584 and recent UAX #15). This module has used the newer definition
 585 since the version 0.07 (Oct 31, 2001).
 586 This module will not support the normalization according to the older
 587 definition, even if the Unicode version implemented by perl is
 588 lower than 4.1.0.
 589
 590 =back
 591
 592 =head1 AUTHOR
 593
 594 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
 595
 596 Currently maintained by <perl5-porters@perl.org>
 597
 598 Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
 599
 600 =head1 LICENSE
 601
 602 This module is free software; you can redistribute it
 603 and/or modify it under the same terms as Perl itself.
 604
 605 =head1 SEE ALSO
 606
 607 =over 4
 608
 609 =item http://www.unicode.org/reports/tr15/
 610
 611 Unicode Normalization Forms - UAX #15
 612
 613 =item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
 614
 615 Composition Exclusion Table
 616
 617 =item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
 618
 619 Derived Normalization Properties
 620
 621 =item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
 622
 623 Normalization Corrections
 624
 625 =item http://www.unicode.org/review/pr-29.html
 626
 627 Public Review Issue #29: Normalization Issue
 628
 629 =item http://www.unicode.org/notes/tn5/
 630
 631 Canonical Equivalence in Applications - UTN #5
 632
 633 =back
 634
 635 =cut