This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Another Unicode-3.2-induced doc fix
[perl5.git] / ext / Unicode / Normalize / Normalize.pm
CommitLineData
ac5ea531
JH
1package Unicode::Normalize;
2
4a2e806c
JH
# Compile-time platform guard: ord("A") is 193 on EBCDIC systems, which
# this module does not support, so loading is aborted there.
BEGIN {
    die "Unicode::Normalize not ported to EBCDIC\n"
        if ord("A") == 193;
}
8
ac5ea531
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
13
be32ca57 14our $VERSION = '0.16';
ac5ea531
JH
15our $PACKAGE = __PACKAGE__;
16
17require Exporter;
18require DynaLoader;
19require AutoLoader;
20
# Parent classes, as named in @ISA below.
our @ISA = ('Exporter', 'DynaLoader');

# Exported by default: the four normalization-form functions.
our @EXPORT = ('NFC', 'NFD', 'NFKC', 'NFKD');

# Exported on request only.
our @EXPORT_OK = (
    # generic entry point and the individual normalization steps
    qw(normalize decompose reorder compose),
    # quick-check functions
    qw(checkNFD checkNFKD checkNFC checkNFKC check),
    # character data accessors
    qw(getCanon getCompat getComposite getCombinClass),
    # character property predicates
    qw(isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex),
    qw(isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE),
);

# Import tags: ":all", ":normalize" and ":check".
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw(normalize decompose reorder compose) ],
    check     => [ qw(checkNFD checkNFKD checkNFC checkNFKC check) ],
);
ac5ea531
JH
35
36bootstrap Unicode::Normalize $VERSION;
37
ac5ea531
JH
# True value passed as the second argument of decompose() to request the
# compatibility decomposition mapping instead of the canonical one.
use constant COMPAT => 1;

# Normalization Form D: canonical decomposition, then reordering.
sub NFD ($) {
    return reorder(
        decompose($_[0])
    );
}

# Normalization Form KD: compatibility decomposition, then reordering.
sub NFKD ($) {
    return reorder(
        decompose($_[0], COMPAT)
    );
}

# Normalization Form C: canonical decomposition, reordering, composition.
sub NFC ($) {
    return compose(
        reorder(
            decompose($_[0])
        )
    );
}

# Normalization Form KC: compatibility decomposition, reordering,
# composition.
sub NFKC ($) {
    return compose(
        reorder(
            decompose($_[0], COMPAT)
        )
    );
}
44
# normalize($form_name, $string)
#
# Returns $string converted to the requested normalization form by
# dispatching to NFD(), NFC(), NFKD() or NFKC().
#
# $form_name is one of 'D', 'C', 'KD', 'KC'; a leading "NF" is optional
# ('NFD', 'NFC', 'NFKD', 'NFKC').  Any other name makes the function croak.
sub normalize($$)
{
    my $form = shift;

    # Strip the optional "NF" prefix on a copy, so that the error message
    # quotes the name exactly as the caller supplied it (previously
    # 'NFX' was reported as 'X').
    (my $short = $form) =~ s/^NF//;

    return
        $short eq 'D'  ? NFD ($_[0]) :
        $short eq 'C'  ? NFC ($_[0]) :
        $short eq 'KD' ? NFKD($_[0]) :
        $short eq 'KC' ? NFKC($_[0]) :
        croak $PACKAGE."::normalize: invalid form name: $form";
}
56
8f118dcd
JH
# check($form_name, $string)
#
# Quick-checks whether $string is in the requested normalization form by
# dispatching to checkNFD(), checkNFC(), checkNFKD() or checkNFKC(), and
# returns that function's result.
#
# $form_name is one of 'D', 'C', 'KD', 'KC'; a leading "NF" is optional
# ('NFD', 'NFC', 'NFKD', 'NFKC').  Any other name makes the function croak.
sub check($$)
{
    my $form = shift;

    # Strip the optional "NF" prefix on a copy, so that the error message
    # quotes the name exactly as the caller supplied it (previously
    # 'NFX' was reported as 'X').
    (my $short = $form) =~ s/^NF//;

    return
        $short eq 'D'  ? checkNFD ($_[0]) :
        $short eq 'C'  ? checkNFC ($_[0]) :
        $short eq 'KD' ? checkNFKD($_[0]) :
        $short eq 'KC' ? checkNFKC($_[0]) :
        croak $PACKAGE."::check: invalid form name: $form";
}
68
ac5ea531
JH
691;
70__END__
2a204b45
JH
71
72=head1 NAME
73
74Unicode::Normalize - normalized forms of Unicode text
75
76=head1 SYNOPSIS
77
78 use Unicode::Normalize;
79
8f118dcd
JH
80 $NFD_string = NFD($string); # Normalization Form D
81 $NFC_string = NFC($string); # Normalization Form C
82 $NFKD_string = NFKD($string); # Normalization Form KD
83 $NFKC_string = NFKC($string); # Normalization Form KC
2a204b45
JH
84
85 or
86
87 use Unicode::Normalize 'normalize';
88
8f118dcd
JH
89 $NFD_string = normalize('D', $string); # Normalization Form D
90 $NFC_string = normalize('C', $string); # Normalization Form C
91 $NFKD_string = normalize('KD', $string); # Normalization Form KD
92 $NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45
JH
93
94=head1 DESCRIPTION
95
d85850a7 96=head2 Normalization Forms
2a204b45
JH
97
98=over 4
99
8f118dcd 100=item C<$NFD_string = NFD($string)>
2a204b45
JH
101
102returns the Normalization Form D (formed by canonical decomposition).
103
8f118dcd 104=item C<$NFC_string = NFC($string)>
2a204b45
JH
105
106returns the Normalization Form C (formed by canonical decomposition
107followed by canonical composition).
108
8f118dcd 109=item C<$NFKD_string = NFKD($string)>
2a204b45
JH
110
111returns the Normalization Form KD (formed by compatibility decomposition).
112
8f118dcd 113=item C<$NFKC_string = NFKC($string)>
2a204b45
JH
114
115returns the Normalization Form KC (formed by compatibility decomposition
116followed by B<canonical> composition).
117
8f118dcd 118=item C<$normalized_string = normalize($form_name, $string)>
2a204b45
JH
119
120As C<$form_name>, one of the following names must be given.
121
122 'C' or 'NFC' for Normalization Form C
123 'D' or 'NFD' for Normalization Form D
124 'KC' or 'NFKC' for Normalization Form KC
125 'KD' or 'NFKD' for Normalization Form KD
126
127=back
128
8f118dcd
JH
129=head2 Decomposition and Composition
130
131=over 4
132
133=item C<$decomposed_string = decompose($string)>
134
135=item C<$decomposed_string = decompose($string, $useCompatMapping)>
136
137Decomposes the specified string and returns the result.
138
139If the second parameter (a boolean) is omitted or false, decomposes it
140using the Canonical Decomposition Mapping.
141If true, decomposes it using the Compatibility Decomposition Mapping.
142
143The string returned is not always in NFD/NFKD.
144Reordering may be required.
145
146 $NFD_string = reorder(decompose($string)); # eq. to NFD()
147 $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
148
149=item C<$reordered_string = reorder($string)>
150
151Reorders the combining characters and the like in the canonical ordering
152and returns the result.
153
154E.g., when you have a list of NFD/NFKD strings,
155you can get the concatenated NFD/NFKD string from them, saying
156
157 $concat_NFD = reorder(join '', @NFD_strings);
158 $concat_NFKD = reorder(join '', @NFKD_strings);
159
160=item C<$composed_string = compose($string)>
161
162Returns the string where composable pairs are composed.
163
164E.g., when you have a NFD/NFKD string,
165you can get its NFC/NFKC string, saying
166
167 $NFC_string = compose($NFD_string);
168 $NFKC_string = compose($NFKD_string);
169
170=back
171
172=head2 Quick Check
173
14e6b36c 174(see Annex 8, UAX #15; F<DerivedNormalizationProps.txt>)
8f118dcd
JH
175
176The following functions check whether the string is in that normalization form.
177
178The result returned will be:
179
180 YES The string is in that normalization form.
181 NO The string is not in that normalization form.
182 MAYBE Dubious. Maybe yes, maybe no.
183
184=over 4
185
186=item C<$result = checkNFD($string)>
187
188returns YES (1) or NO (empty string).
189
190=item C<$result = checkNFC($string)>
191
192returns YES (1), NO (empty string), or MAYBE (undef).
193
194=item C<$result = checkNFKD($string)>
195
196returns YES (1) or NO (empty string).
197
198=item C<$result = checkNFKC($string)>
199
200returns YES (1), NO (empty string), or MAYBE (undef).
201
202=item C<$result = check($form_name, $string)>
203
204returns YES (1), NO (empty string), or MAYBE (undef).
205
206C<$form_name> takes the same values as for C<normalize()>.
207
208=back
209
210B<Note>
211
212In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
213The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
214
215A MAYBE-NFC/NFKC string should contain at least
216one combining character or the like.
217For example, C<COMBINING ACUTE ACCENT> has
218the MAYBE_NFC/MAYBE_NFKC property.
219Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
220and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
221However, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
222(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
223while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
224
225If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
226
227 $string eq NFC($string) # more thorough than checkNFC($string)
228 $string eq NFKC($string) # more thorough than checkNFKC($string)
229
2a204b45
JH
230=head2 Character Data
231
232These functions are an interface to the character data used internally.
d0ed0342
BG
233If you only want to get Unicode normalization forms, you don't need
234to call them yourself.
2a204b45
JH
235
236=over 4
237
238=item C<$canonical_decomposed = getCanon($codepoint)>
239
8f118dcd
JH
240If the character of the specified codepoint is canonically
241decomposable (including Hangul Syllables),
242returns the B<completely decomposed> string canonically equivalent to it.
243
244If it is not decomposable, returns undef.
245
2a204b45
JH
246=item C<$compatibility_decomposed = getCompat($codepoint)>
247
8f118dcd
JH
248If the character of the specified codepoint is compatibility
249decomposable (including Hangul Syllables),
250returns the B<completely decomposed> string compatibility equivalent to it.
2a204b45
JH
251
252If it is not decomposable, returns undef.
253
8f118dcd 254=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
2a204b45 255
d85850a7 256If two characters here and next (as codepoints) are composable
8f118dcd 257(including Hangul Jamo/Syllables and Composition Exclusions),
2a204b45
JH
258returns the codepoint of the composite.
259
260If they are not composable, returns undef.
261
262=item C<$combining_class = getCombinClass($codepoint)>
263
8f118dcd 264Returns the combining class of the character as an integer.
2a204b45
JH
265
266=item C<$is_exclusion = isExclusion($codepoint)>
267
8f118dcd
JH
268Returns a boolean indicating whether the character of the specified codepoint
269is a composition exclusion.
270
271=item C<$is_singleton = isSingleton($codepoint)>
272
2a204b45 273Returns a boolean indicating whether the character of the specified codepoint is
8f118dcd
JH
274a singleton.
275
276=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
277
278Returns a boolean indicating whether the canonical decomposition
279of the character of the specified codepoint
280is a Non-Starter Decomposition.
281
282=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
283
284Returns a boolean indicating whether the character of the specified codepoint
285may be composed with the previous one in a certain composition
286(including Hangul Compositions, but excluding
287Composition Exclusions and Non-Starter Decompositions).
2a204b45
JH
288
289=back
290
291=head2 EXPORT
292
293C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
294
295C<normalize> and some other functions: on request.
296
297=head1 AUTHOR
298
299SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
300
301 http://homepage1.nifty.com/nomenclator/perl/
302
ab8fe378 303 Copyright(C) 2001-2002, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45
JH
304
305 This program is free software; you can redistribute it and/or
306 modify it under the same terms as Perl itself.
307
308=head1 SEE ALSO
309
310=over 4
311
312=item http://www.unicode.org/unicode/reports/tr15/
313
314Unicode Normalization Forms - UAX #15
315
14e6b36c 316=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd
JH
317
318Derived Normalization Properties
319
2a204b45
JH
320=back
321
322=cut
323