This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move SelfLoader from ext/ to dist/
[perl5.git] / ext / Unicode-Normalize / Normalize.pm
CommitLineData
ac5ea531
JH
package Unicode::Normalize;

# Compile-time guard: refuse to load unless pack('U', 0x41) yields the
# one-character string "A".  Everything in this module builds strings from
# code points with the 'U' template, so a failure here is fatal.
BEGIN {
    unless ("A" eq pack('U', 0x41)) {
        die "Unicode::Normalize cannot stringify a Unicode code point\n";
    }
}
8
ac5ea531
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
e524f5b2
NC
14no warnings 'utf8';
15
# Module version; also passed to bootstrap() when the extension is loaded.
our $VERSION = '1.03';
# Package name, used as the prefix of error messages raised by this module.
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);

# Exported by default: the four standard normalization forms.
our @EXPORT = qw( NFC NFD NFKC NFKD );

# Exported on request only.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);

# Import tags, e.g. "use Unicode::Normalize qw(:all);".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);
ac5ea531 39
82e740b6
NC
40######
41
ac5ea531
JH
# Load the compiled part of the module via DynaLoader (inherited through
# @ISA), passing $VERSION as the bootstrap parameter.
bootstrap Unicode::Normalize $VERSION;
43
82e740b6
NC
44######
45
fe067ad9
SP
##
## utilities for tests
##
49
# pack_U(@code_points)
#
# Returns the string built from the given list of code points with the
# 'U*' pack template, one character per code point.  An empty list gives
# an empty string.
sub pack_U {
    return pack('U*', @_);
}
53
# unpack_U($string)
#
# Returns the list of code points of the characters in $string, i.e. the
# inverse of pack_U().
sub unpack_U {
    # The argument is concatenated with pack('U*') -- an empty string built
    # with the 'U' template -- before being unpacked.
    # NOTE(review): presumably this upgrades a byte string to a character
    # string so that 'U*' reads characters rather than raw bytes; confirm
    # against the pack/unpack documentation before simplifying.
    return unpack('U*', shift(@_).pack('U*'));
}
57
82e740b6
NC
58
59##
60## normalization forms
61##
62
82e740b6
NC
# FCD($string)
#
# Returns $string unchanged when checkFCD($string) is true; otherwise
# returns NFD($string).  checkFCD() and NFD() are not defined in this part
# of the file (presumably they are provided by the bootstrapped extension).
sub FCD ($) {
    my $str = shift;
    return checkFCD($str) ? $str : NFD($str);
}
82e740b6
NC
67
# Dispatch table: normalization form name => normalizer.  Both the short
# (C, D, KC, KD) and the long (NFC, NFD, NFKC, NFKD) names are accepted,
# plus FCD and FCC.
our %formNorm = (
    NFC  => \&NFC,   C  => \&NFC,
    NFD  => \&NFD,   D  => \&NFD,
    NFKC => \&NFKC,  KC => \&NFKC,
    NFKD => \&NFKD,  KD => \&NFKD,
    FCD  => \&FCD,   FCC => \&FCC,
);

# normalize($form_name, $string)
#
# Looks $form_name up in %formNorm and returns the result of calling the
# matching normalizer on $string.  Croaks (reported from the caller's
# perspective) when $form_name is not a key of %formNorm.
sub normalize($$)
{
    my $form = shift;
    my $str = shift;
    if (exists $formNorm{$form}) {
        return $formNorm{$form}->($str);
    }
    croak($PACKAGE."::normalize: invalid form name: $form");
}
85
82e740b6
NC
86
87##
88## quick check
89##
90
# Dispatch table: normalization form name => quick-check function.  It
# accepts the same names as %formNorm.
our %formCheck = (
    NFC  => \&checkNFC,   C  => \&checkNFC,
    NFD  => \&checkNFD,   D  => \&checkNFD,
    NFKC => \&checkNFKC,  KC => \&checkNFKC,
    NFKD => \&checkNFKD,  KD => \&checkNFKD,
    FCD  => \&checkFCD,   FCC => \&checkFCC,
);

# check($form_name, $string)
#
# Looks $form_name up in %formCheck and returns the result of calling the
# matching quick-check function on $string.  Croaks (reported from the
# caller's perspective) when $form_name is not a key of %formCheck.
sub check($$)
{
    my $form = shift;
    my $str = shift;
    if (exists $formCheck{$form}) {
        return $formCheck{$form}->($str);
    }
    croak($PACKAGE."::check: invalid form name: $form");
}
108
ac5ea531
JH
1;
__END__
2a204b45
JH
111
112=head1 NAME
113
f027f502 114Unicode::Normalize - Unicode Normalization Forms
2a204b45
JH
115
116=head1 SYNOPSIS
117
a092bcfd
RGS
118(1) using function names exported by default:
119
2a204b45
JH
120 use Unicode::Normalize;
121
8f118dcd
JH
122 $NFD_string = NFD($string); # Normalization Form D
123 $NFC_string = NFC($string); # Normalization Form C
124 $NFKD_string = NFKD($string); # Normalization Form KD
125 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45 126
a092bcfd 127(2) using function names exported on request:
2a204b45
JH
128
129 use Unicode::Normalize 'normalize';
130
8f118dcd
JH
131 $NFD_string = normalize('D', $string); # Normalization Form D
132 $NFC_string = normalize('C', $string); # Normalization Form C
133 $NFKD_string = normalize('KD', $string); # Normalization Form KD
134 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45
JH
135
136=head1 DESCRIPTION
137
00f2676f
JH
138Parameters:
139
fe067ad9 140C<$string> is used as a string under character semantics (see F<perlunicode>).
00f2676f 141
fe067ad9 142C<$code_point> should be an unsigned integer representing a Unicode code point.
00f2676f 143
628bbff0 144Note: Between XSUB and pure Perl, there is an incompatibility
fe067ad9
SP
145about the interpretation of C<$code_point> as a decimal number.
146XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$code_point>.
00f2676f 148
d85850a7 149=head2 Normalization Forms
2a204b45
JH
150
151=over 4
152
8f118dcd 153=item C<$NFD_string = NFD($string)>
2a204b45 154
fe067ad9 155It returns the Normalization Form D (formed by canonical decomposition).
2a204b45 156
8f118dcd 157=item C<$NFC_string = NFC($string)>
2a204b45 158
fe067ad9 159It returns the Normalization Form C (formed by canonical decomposition
2a204b45
JH
160followed by canonical composition).
161
8f118dcd 162=item C<$NFKD_string = NFKD($string)>
2a204b45 163
fe067ad9 164It returns the Normalization Form KD (formed by compatibility decomposition).
2a204b45 165
8f118dcd 166=item C<$NFKC_string = NFKC($string)>
2a204b45 167
fe067ad9 168It returns the Normalization Form KC (formed by compatibility decomposition
2a204b45
JH
169followed by B<canonical> composition).
170
82e740b6
NC
171=item C<$FCD_string = FCD($string)>
172
173If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
fe067ad9 174it returns the string without modification; otherwise it returns an FCD string.
82e740b6
NC
175
Note: FCD is not always unique, so several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.
178
179=item C<$FCC_string = FCC($string)>
180
fe067ad9 181It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
82e740b6 182
Note: FCC is unique, as are the four normalization forms (NF*).
82e740b6 184
8f118dcd 185=item C<$normalized_string = normalize($form_name, $string)>
2a204b45 186
fe067ad9
SP
187It returns the normalization form of C<$form_name>.
188
2a204b45
JH
189As C<$form_name>, one of the following names must be given.
190
82e740b6
NC
191 'C' or 'NFC' for Normalization Form C (UAX #15)
192 'D' or 'NFD' for Normalization Form D (UAX #15)
193 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
194 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
195
196 'FCD' for "Fast C or D" Form (UTN #5)
197 'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45
JH
198
199=back
200
8f118dcd
JH
201=head2 Decomposition and Composition
202
203=over 4
204
fe067ad9 205=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
8f118dcd 206
fe067ad9
SP
207It returns the concatenation of the decomposition of each character
208in the string.
8f118dcd 209
fe067ad9
SP
210If the second parameter (a boolean) is omitted or false,
211the decomposition is canonical decomposition;
212if the second parameter (a boolean) is true,
213the decomposition is compatibility decomposition.
8f118dcd 214
fe067ad9 215The string returned is not always in NFD/NFKD. Reordering may be required.
8f118dcd
JH
216
217 $NFD_string = reorder(decompose($string)); # eq. to NFD()
218 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
219
fe067ad9 220=item C<$reordered_string = reorder($string)>
8f118dcd 221
fe067ad9
SP
222It returns the result of reordering the combining characters
223according to Canonical Ordering Behavior.
8f118dcd 224
fe067ad9
SP
225For example, when you have a list of NFD/NFKD strings,
226you can get the concatenated NFD/NFKD string from them, by saying
8f118dcd
JH
227
228 $concat_NFD = reorder(join '', @NFD_strings);
229 $concat_NFKD = reorder(join '', @NFKD_strings);
230
fe067ad9 231=item C<$composed_string = compose($string)>
8f118dcd 232
fe067ad9
SP
233It returns the result of canonical composition
234without applying any decomposition.
8f118dcd 235
fe067ad9
SP
236For example, when you have a NFD/NFKD string,
237you can get its NFC/NFKC string, by saying
8f118dcd
JH
238
239 $NFC_string = compose($NFD_string);
240 $NFKC_string = compose($NFKD_string);
241
242=back
243
244=head2 Quick Check
245
82e740b6 246(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd
JH
247
248The following functions check whether the string is in that normalization form.
249
fe067ad9 250The result returned will be one of the following:
8f118dcd
JH
251
252 YES The string is in that normalization form.
253 NO The string is not in that normalization form.
254 MAYBE Dubious. Maybe yes, maybe no.
255
256=over 4
257
258=item C<$result = checkNFD($string)>
259
fe067ad9 260It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
261
262=item C<$result = checkNFC($string)>
263
fe067ad9 264It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 265C<undef> if C<MAYBE>.
8f118dcd
JH
266
267=item C<$result = checkNFKD($string)>
268
fe067ad9 269It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd
JH
270
271=item C<$result = checkNFKC($string)>
272
fe067ad9 273It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 274C<undef> if C<MAYBE>.
8f118dcd 275
82e740b6
NC
276=item C<$result = checkFCD($string)>
277
fe067ad9 278It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
82e740b6
NC
279
280=item C<$result = checkFCC($string)>
281
fe067ad9 282It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 283C<undef> if C<MAYBE>.
82e740b6 284
Note: If a string is not in FCD, it cannot be in FCC.
So C<checkFCC($not_FCD_string)> should return C<NO>.
287
8f118dcd
JH
288=item C<$result = check($form_name, $string)>
289
fe067ad9 290It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0 291C<undef> if C<MAYBE>.
8f118dcd 292
628bbff0
RGS
293As C<$form_name>, one of the following names must be given.
294
295 'C' or 'NFC' for Normalization Form C (UAX #15)
296 'D' or 'NFD' for Normalization Form D (UAX #15)
297 'KC' or 'NFKC' for Normalization Form KC (UAX #15)
298 'KD' or 'NFKD' for Normalization Form KD (UAX #15)
299
300 'FCD' for "Fast C or D" Form (UTN #5)
301 'FCC' for "Fast C Contiguous" (UTN #5)
8f118dcd
JH
302
303=back
304
305B<Note>
306
82e740b6
NC
307In the cases of NFD, NFKD, and FCD, the answer must be
308either C<YES> or C<NO>. The answer C<MAYBE> may be returned
309in the cases of NFC, NFKC, and FCC.
8f118dcd 310
82e740b6
NC
311A C<MAYBE> string should contain at least one combining character
312or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd 313the MAYBE_NFC/MAYBE_NFKC property.
82e740b6 314
8f118dcd
JH
315Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
316and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502 317C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd
JH
318(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
319while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
320
628bbff0
RGS
321If you want to check exactly, compare the string with its NFC/NFKC/FCC.
322
323 if ($string eq NFC($string)) {
324 # $string is exactly normalized in NFC;
325 } else {
326 # $string is not normalized in NFC;
327 }
8f118dcd 328
628bbff0
RGS
329 if ($string eq NFKC($string)) {
330 # $string is exactly normalized in NFKC;
331 } else {
332 # $string is not normalized in NFKC;
333 }
8f118dcd 334
2a204b45
JH
335=head2 Character Data
336
These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.
2a204b45
JH
340
341=over 4
342
fe067ad9 343=item C<$canonical_decomposition = getCanon($code_point)>
2a204b45 344
fe067ad9
SP
345If the character is canonically decomposable (including Hangul Syllables),
346it returns the (full) canonical decomposition as a string.
347Otherwise it returns C<undef>.
8f118dcd 348
fe067ad9
SP
B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.
8f118dcd 352
fe067ad9 353=item C<$compatibility_decomposition = getCompat($code_point)>
2a204b45 354
fe067ad9
SP
355If the character is compatibility decomposable (including Hangul Syllables),
356it returns the (full) compatibility decomposition as a string.
357Otherwise it returns C<undef>.
2a204b45 358
fe067ad9
SP
B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.
2a204b45 362
fe067ad9 363=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
2a204b45 364
fe067ad9 365If two characters here and next (as code points) are composable
8f118dcd 366(including Hangul Jamo/Syllables and Composition Exclusions),
fe067ad9
SP
367it returns the code point of the composite.
368
369If they are not composable, it returns C<undef>.
2a204b45 370
fe067ad9 371=item C<$combining_class = getCombinClass($code_point)>
2a204b45 372
fe067ad9 373It returns the combining class (as an integer) of the character.
2a204b45 374
fe067ad9 375=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
2a204b45 376
fe067ad9
SP
377It returns a boolean whether the character of the specified codepoint
378may be composed with the previous one in a certain composition
379(including Hangul Compositions, but excluding
380Composition Exclusions and Non-Starter Decompositions).
2a204b45 381
fe067ad9 382=item C<$is_exclusion = isExclusion($code_point)>
8f118dcd 383
fe067ad9 384It returns a boolean whether the code point is a composition exclusion.
8f118dcd 385
fe067ad9 386=item C<$is_singleton = isSingleton($code_point)>
8f118dcd 387
It returns a boolean whether the code point is a singleton.
8f118dcd 389
fe067ad9 390=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
8f118dcd 391
fe067ad9 392It returns a boolean whether the code point has Non-Starter Decomposition.
8f118dcd 393
fe067ad9
SP
394=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
395
396It returns a boolean of the derived property Comp_Ex
397(Full_Composition_Exclusion). This property is generated from
398Composition Exclusions + Singletons + Non-Starter Decompositions.
399
400=item C<$NFD_is_NO = isNFD_NO($code_point)>
401
402It returns a boolean of the derived property NFD_NO
403(NFD_Quick_Check=No).
404
405=item C<$NFC_is_NO = isNFC_NO($code_point)>
406
407It returns a boolean of the derived property NFC_NO
408(NFC_Quick_Check=No).
409
410=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
411
412It returns a boolean of the derived property NFC_MAYBE
413(NFC_Quick_Check=Maybe).
414
415=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
416
417It returns a boolean of the derived property NFKD_NO
418(NFKD_Quick_Check=No).
419
420=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
421
422It returns a boolean of the derived property NFKC_NO
423(NFKC_Quick_Check=No).
424
425=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
426
427It returns a boolean of the derived property NFKC_MAYBE
428(NFKC_Quick_Check=Maybe).
2a204b45
JH
429
430=back
431
628bbff0 432=head1 EXPORT
2a204b45
JH
433
434C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
435
C<normalize> and some other functions: on request.
437
628bbff0
RGS
438=head1 CAVEATS
439
440=over 4
441
442=item Perl's version vs. Unicode version
443
444Since this module refers to perl core's Unicode database in the directory
445F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
446normalization implemented by this module depends on your perl's version.
447
fe067ad9
SP
448 perl's version implemented Unicode version
449 5.6.1 3.0.1
450 5.7.2 3.1.0
451 5.7.3 3.1.1 (normalization is same as 3.1.0)
452 5.8.0 3.2.0
453 5.8.1-5.8.3 4.0.0
454 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
455 5.8.7-5.8.8 4.1.0
51683ce6
TS
456 5.10.0 5.0.0
457 5.8.9 5.1.0
628bbff0
RGS
458
459=item Correction of decomposition mapping
460
461In older Unicode versions, a small number of characters (all of which are
462CJK compatibility ideographs as far as they have been found) may have
463an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
464Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
465nor provide any specific version of normalization. Therefore this module
466running on an older perl with an older Unicode database may use
467the erroneous decomposition mapping blindly conforming to the Unicode database.
468
469=item Revised definition of canonical composition
470
471In Unicode 4.1.0, the definition D2 of canonical composition (which
472affects NFC and NFKC) has been changed (see Public Review Issue #29
473and recent UAX #15). This module has used the newer definition
474since the version 0.07 (Oct 31, 2001).
2b8d773d 475This module will not support the normalization according to the older
628bbff0
RGS
476definition, even if the Unicode version implemented by perl is
477lower than 4.1.0.
478
479=back
480
2a204b45
JH
481=head1 AUTHOR
482
a092bcfd 483SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
2a204b45 484
2b8d773d 485Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45 486
628bbff0
RGS
487This module is free software; you can redistribute it
488and/or modify it under the same terms as Perl itself.
2a204b45
JH
489
490=head1 SEE ALSO
491
492=over 4
493
e524f5b2 494=item http://www.unicode.org/reports/tr15/
2a204b45
JH
495
496Unicode Normalization Forms - UAX #15
497
fe067ad9
SP
498=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
499
500Composition Exclusion Table
501
14e6b36c 502=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd
JH
503
504Derived Normalization Properties
505
628bbff0
RGS
506=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
507
508Normalization Corrections
509
510=item http://www.unicode.org/review/pr-29.html
511
512Public Review Issue #29: Normalization Issue
513
82e740b6
NC
514=item http://www.unicode.org/notes/tn5/
515
516Canonical Equivalence in Applications - UTN #5
517
2a204b45
JH
518=back
519
520=cut