# [perl5.git] / ext / Unicode-Normalize / Normalize.pm

package Unicode::Normalize;

BEGIN {
    # Compile-time sanity check: refuse to load the module when
    # pack('U', 0x41) does not produce the one-character string "A",
    # i.e. when this perl cannot turn a code point into the expected string.
    die "Unicode::Normalize cannot stringify a Unicode code point\n"
	if "A" ne pack('U', 0x41);
}

use 5.006;
use strict;
use warnings;
use Carp;

no warnings 'utf8';

# Module version; also passed to bootstrap below.
our $VERSION = '1.03';
# Package name, used as the prefix of the croak() messages
# raised by normalize() and check().
our $PACKAGE = __PACKAGE__;

require Exporter;
require DynaLoader;

# Exporter supplies the import machinery; DynaLoader supplies bootstrap().
our @ISA = qw(Exporter DynaLoader);
# Names exported by default.
our @EXPORT = qw( NFC NFD NFKC NFKD );
# Names exported only on request.
our @EXPORT_OK = qw(
    normalize decompose reorder compose
    checkNFD checkNFKD checkNFC checkNFKC check
    getCanon getCompat getComposite getCombinClass
    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
    FCD checkFCD FCC checkFCC composeContiguous
    splitOnLastStarter
);
# Export tags (":all", ":normalize", ":check", ":fast").
# Note that splitOnLastStarter is reachable through ":all" only.
our %EXPORT_TAGS = (
    all       => [ @EXPORT, @EXPORT_OK ],
    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
);

######

# Load the compiled part of the module for this version.
# NOTE(review): the subs used but not defined in this file (NFC, NFD,
# NFKC, NFKD, FCC, checkFCD, ...) are presumably installed by this
# step -- confirm against the XS source.
bootstrap Unicode::Normalize $VERSION;

######

##
## utilities for tests
##

# pack_U(@code_points)
# Builds a string from a list of code points using the 'U*' pack
# template and returns it. An empty list gives an empty string.
sub pack_U {
    my @code_points = @_;
    my $string = pack('U*', @code_points);
    return $string;
}

# unpack_U($string)
# Returns the values obtained by unpacking $string with the 'U*'
# template (the inverse of pack_U for the strings it produces).
sub unpack_U {
    my $string = shift(@_);
    # pack('U*') with no values is an empty string; it is appended before
    # unpacking. NOTE(review): presumably this forces the argument to be
    # handled under character semantics -- confirm against perlfunc/pack.
    my $suffixed = $string . pack('U*');
    return unpack('U*', $suffixed);
}


##
## normalization forms
##

# FCD($string)
# Returns $string itself when checkFCD() reports it as true;
# otherwise returns NFD($string).
sub FCD ($) {
    my ($str) = @_;
    return $str if checkFCD($str);
    return NFD($str);
}

# Dispatch table used by normalize(): form name => normalizing sub.
# The four NF* forms are reachable by their full name and by a short alias.
our %formNorm = (
    NFC  => \&NFC,
    C    => \&NFC,

    NFD  => \&NFD,
    D    => \&NFD,

    NFKC => \&NFKC,
    KC   => \&NFKC,

    NFKD => \&NFKD,
    KD   => \&NFKD,

    # The "fast" forms have no short alias.
    FCD  => \&FCD,
    FCC  => \&FCC,
);

# normalize($form, $str)
# Looks $form up in %formNorm and returns the result of calling the
# matching sub on $str. Croaks when $form is not a key of %formNorm.
sub normalize($$)
{
    my ($form, $str) = @_;

    # Reject unknown form names first.
    croak($PACKAGE."::normalize: invalid form name: $form")
	unless exists $formNorm{$form};

    return $formNorm{$form}->($str);
}


##
## quick check
##

# Dispatch table used by check(): form name => quick-check sub.
# The four NF* forms are reachable by their full name and by a short alias.
our %formCheck = (
    NFC  => \&checkNFC,
    C    => \&checkNFC,

    NFD  => \&checkNFD,
    D    => \&checkNFD,

    NFKC => \&checkNFKC,
    KC   => \&checkNFKC,

    NFKD => \&checkNFKD,
    KD   => \&checkNFKD,

    # The "fast" forms have no short alias.
    FCD  => \&checkFCD,
    FCC  => \&checkFCC,
);

# check($form, $str)
# Looks $form up in %formCheck and returns the result of calling the
# matching quick-check sub on $str. Croaks when $form is not a key
# of %formCheck.
sub check($$)
{
    my ($form, $str) = @_;

    # Reject unknown form names first.
    exists $formCheck{$form}
	or croak($PACKAGE."::check: invalid form name: $form");

    return $formCheck{$form}->($str);
}

1;
__END__

=head1 NAME

Unicode::Normalize - Unicode Normalization Forms

=head1 SYNOPSIS

(1) using function names exported by default:

  use Unicode::Normalize;

  $NFD_string  = NFD($string);  # Normalization Form D
  $NFC_string  = NFC($string);  # Normalization Form C
  $NFKD_string = NFKD($string); # Normalization Form KD
  $NFKC_string = NFKC($string); # Normalization Form KC

(2) using function names exported on request:

  use Unicode::Normalize 'normalize';

  $NFD_string  = normalize('D',  $string);  # Normalization Form D
  $NFC_string  = normalize('C',  $string);  # Normalization Form C
  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
  $NFKC_string = normalize('KC', $string);  # Normalization Form KC

=head1 DESCRIPTION

Parameters:

C<$string> is used as a string under character semantics (see F<perlunicode>).

C<$code_point> should be an unsigned integer representing a Unicode code point.

Note: Between XSUB and pure Perl, there is an incompatibility
about the interpretation of C<$code_point> as a decimal number.
XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
Do not use a floating-point number or a negative sign in C<$code_point>.

=head2 Normalization Forms

=over 4

=item C<$NFD_string = NFD($string)>

It returns the Normalization Form D (formed by canonical decomposition).

=item C<$NFC_string = NFC($string)>

It returns the Normalization Form C (formed by canonical decomposition
followed by canonical composition).

=item C<$NFKD_string = NFKD($string)>

It returns the Normalization Form KD (formed by compatibility decomposition).

=item C<$NFKC_string = NFKC($string)>

It returns the Normalization Form KC (formed by compatibility decomposition
followed by B<canonical> composition).

=item C<$FCD_string = FCD($string)>

If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
it returns the string without modification; otherwise it returns an FCD string.

Note: FCD is not always unique; several different forms may be equivalent
to each other. C<FCD()> will return one of these equivalent forms.

=item C<$FCC_string = FCC($string)>

It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).

Note: FCC is unique, as are the four normalization forms (NF*).

=item C<$normalized_string = normalize($form_name, $string)>

It returns the normalization form of C<$form_name>.

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

=head2 Decomposition and Composition

=over 4

=item C<$decomposed_string = decompose($string [, $useCompatMapping])>

It returns the concatenation of the decomposition of each character
in the string.

If the second parameter (a boolean) is omitted or false,
the decomposition is canonical decomposition;
if the second parameter (a boolean) is true,
the decomposition is compatibility decomposition.

The string returned is not always in NFD/NFKD. Reordering may be required.

    $NFD_string  = reorder(decompose($string));       # eq. to NFD()
    $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()

=item C<$reordered_string = reorder($string)>

It returns the result of reordering the combining characters
according to Canonical Ordering Behavior.

For example, when you have a list of NFD/NFKD strings,
you can get the concatenated NFD/NFKD string from them, by saying

    $concat_NFD  = reorder(join '', @NFD_strings);
    $concat_NFKD = reorder(join '', @NFKD_strings);

=item C<$composed_string = compose($string)>

It returns the result of canonical composition
without applying any decomposition.

For example, when you have a NFD/NFKD string,
you can get its NFC/NFKC string, by saying

    $NFC_string  = compose($NFD_string);
    $NFKC_string = compose($NFKD_string);

=back

=head2 Quick Check

(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)

The following functions check whether the string is in that normalization form.

The result returned will be one of the following:

    YES     The string is in that normalization form.
    NO      The string is not in that normalization form.
    MAYBE   Dubious. Maybe yes, maybe no.

=over 4

=item C<$result = checkNFD($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFC($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkNFKD($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkNFKC($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

=item C<$result = checkFCD($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.

=item C<$result = checkFCC($string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

Note: If a string is not in FCD, it cannot be in FCC either.
So C<checkFCC($not_FCD_string)> should return C<NO>.

=item C<$result = check($form_name, $string)>

It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
C<undef> if C<MAYBE>.

As C<$form_name>, one of the following names must be given.

  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
  'KD' or 'NFKD' for Normalization Form KD (UAX #15)

  'FCD'          for "Fast C or D" Form  (UTN #5)
  'FCC'          for "Fast C Contiguous" (UTN #5)

=back

B<Note>

In the cases of NFD, NFKD, and FCD, the answer must be
either C<YES> or C<NO>. The answer C<MAYBE> may be returned
in the cases of NFC, NFKC, and FCC.

A C<MAYBE> string should contain at least one combining character
or the like. For example, C<COMBINING ACUTE ACCENT> has
the MAYBE_NFC/MAYBE_NFKC property.

Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.

If you want to check exactly, compare the string with its NFC/NFKC/FCC.

    if ($string eq NFC($string)) {
	# $string is exactly normalized in NFC;
    } else {
	# $string is not normalized in NFC;
    }

    if ($string eq NFKC($string)) {
	# $string is exactly normalized in NFKC;
    } else {
	# $string is not normalized in NFKC;
    }

=head2 Character Data

These functions are an interface to the character data used internally.
If you only want to get Unicode normalization forms, you don't need to
call them yourself.

=over 4

=item C<$canonical_decomposition = getCanon($code_point)>

If the character is canonically decomposable (including Hangul Syllables),
it returns the (full) canonical decomposition as a string.
Otherwise it returns C<undef>.

B<Note:> According to the Unicode standard, the canonical decomposition
of a character that is not canonically decomposable is the same as
the character itself.

=item C<$compatibility_decomposition = getCompat($code_point)>

If the character is compatibility decomposable (including Hangul Syllables),
it returns the (full) compatibility decomposition as a string.
Otherwise it returns C<undef>.

B<Note:> According to the Unicode standard, the compatibility decomposition
of a character that is not compatibility decomposable is the same as
the character itself.

=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>

If two characters here and next (as code points) are composable
(including Hangul Jamo/Syllables and Composition Exclusions),
it returns the code point of the composite.

If they are not composable, it returns C<undef>.

=item C<$combining_class = getCombinClass($code_point)>

It returns the combining class (as an integer) of the character.

=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>

It returns a boolean whether the character of the specified codepoint
may be composed with the previous one in a certain composition
(including Hangul Compositions, but excluding
Composition Exclusions and Non-Starter Decompositions).

=item C<$is_exclusion = isExclusion($code_point)>

It returns a boolean whether the code point is a composition exclusion.

=item C<$is_singleton = isSingleton($code_point)>

It returns a boolean whether the code point is a singleton.

=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>

It returns a boolean whether the code point has Non-Starter Decomposition.

=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>

It returns a boolean of the derived property Comp_Ex
(Full_Composition_Exclusion). This property is generated from
Composition Exclusions + Singletons + Non-Starter Decompositions.

=item C<$NFD_is_NO = isNFD_NO($code_point)>

It returns a boolean of the derived property NFD_NO
(NFD_Quick_Check=No).

=item C<$NFC_is_NO = isNFC_NO($code_point)>

It returns a boolean of the derived property NFC_NO
(NFC_Quick_Check=No).

=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>

It returns a boolean of the derived property NFC_MAYBE
(NFC_Quick_Check=Maybe).

=item C<$NFKD_is_NO = isNFKD_NO($code_point)>

It returns a boolean of the derived property NFKD_NO
(NFKD_Quick_Check=No).

=item C<$NFKC_is_NO = isNFKC_NO($code_point)>

It returns a boolean of the derived property NFKC_NO
(NFKC_Quick_Check=No).

=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>

It returns a boolean of the derived property NFKC_MAYBE
(NFKC_Quick_Check=Maybe).

=back

=head1 EXPORT

C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.

C<normalize> and some other functions: on request.

=head1 CAVEATS

=over 4

=item Perl's version vs. Unicode version

Since this module refers to perl core's Unicode database in the directory
F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
normalization implemented by this module depends on your perl's version.

    perl's version     implemented Unicode version
       5.6.1              3.0.1
       5.7.2              3.1.0
       5.7.3              3.1.1 (normalization is same as 3.1.0)
       5.8.0              3.2.0
     5.8.1-5.8.3          4.0.0
     5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
     5.8.7-5.8.8          4.1.0
       5.10.0             5.0.0
       5.8.9              5.1.0

=item Correction of decomposition mapping

In older Unicode versions, a small number of characters (all of which are
CJK compatibility ideographs as far as they have been found) may have
an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
nor provide any specific version of normalization. Therefore this module
running on an older perl with an older Unicode database may use
the erroneous decomposition mapping blindly conforming to the Unicode database.

=item Revised definition of canonical composition

In Unicode 4.1.0, the definition D2 of canonical composition (which
affects NFC and NFKC) has been changed (see Public Review Issue #29
and recent UAX #15). This module has used the newer definition
since the version 0.07 (Oct 31, 2001).
This module will not support the normalization according to the older
definition, even if the Unicode version implemented by perl is
lower than 4.1.0.

=back

=head1 AUTHOR

SADAHIRO Tomoyuki <SADAHIRO@cpan.org>

Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.

This module is free software; you can redistribute it
and/or modify it under the same terms as Perl itself.

=head1 SEE ALSO

=over 4

=item http://www.unicode.org/reports/tr15/

Unicode Normalization Forms - UAX #15

=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt

Composition Exclusion Table

=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt

Derived Normalization Properties

=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt

Normalization Corrections

=item http://www.unicode.org/review/pr-29.html

Public Review Issue #29: Normalization Issue

=item http://www.unicode.org/notes/tn5/

Canonical Equivalence in Applications - UTN #5

=back

=cut
Commit	Line	Data
ac5ea531 JH	1	package Unicode::Normalize;
ac5ea531 JH	2
4a2e806c	3	BEGIN {
1efaba7f	4	unless ("A" eq pack('U', 0x41)) {
9f1f04a1	5	die "Unicode::Normalize cannot stringify a Unicode code point\n";
4a2e806c JH	6	}
	7	}
	8
ac5ea531 JH	9	use 5.006;
	10	use strict;
	11	use warnings;
	12	use Carp;
	13
e524f5b2 NC	14	no warnings 'utf8';
e524f5b2 NC	15
51683ce6	16	our $VERSION = '1.03';
ac5ea531 JH	17	our $PACKAGE = __PACKAGE__;
	18
	19	require Exporter;
	20	require DynaLoader;
ac5ea531 JH	21
	22	our @ISA = qw(Exporter DynaLoader);
	23	our @EXPORT = qw( NFC NFD NFKC NFKD );
2a204b45 JH	24	our @EXPORT_OK = qw(
2a204b45 JH	25	normalize decompose reorder compose
8f118dcd JH	26	checkNFD checkNFKD checkNFC checkNFKC check
	27	getCanon getCompat getComposite getCombinClass
	28	isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
	29	isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
82e740b6 NC	30	FCD checkFCD FCC checkFCC composeContiguous
82e740b6 NC	31	splitOnLastStarter
8f118dcd JH	32	);
	33	our %EXPORT_TAGS = (
	34	all => [ @EXPORT, @EXPORT_OK ],
	35	normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
	36	check => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
82e740b6	37	fast => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
2a204b45	38	);
ac5ea531	39
82e740b6 NC	40	######
82e740b6 NC	41
ac5ea531 JH	42	bootstrap Unicode::Normalize $VERSION;
ac5ea531 JH	43
82e740b6 NC	44	######
82e740b6 NC	45
fe067ad9 SP	46	##
	47	## utilites for tests
	48	##
	49
9f1f04a1	50	sub pack_U {
b8d10bc1	51	return pack('U*', @_);
9f1f04a1 RGS	52	}
	53
	54	sub unpack_U {
fe067ad9	55	return unpack('U', shift(@_).pack('U'));
9f1f04a1 RGS	56	}
9f1f04a1 RGS	57
82e740b6 NC	58
	59	##
	60	## normalization forms
	61	##
	62
82e740b6 NC	63	sub FCD ($) {
	64	my $str = shift;
	65	return checkFCD($str) ? $str : NFD($str);
	66	}
82e740b6 NC	67
	68	our %formNorm = (
	69	NFC => \&NFC, C => \&NFC,
	70	NFD => \&NFD, D => \&NFD,
	71	NFKC => \&NFKC, KC => \&NFKC,
	72	NFKD => \&NFKD, KD => \&NFKD,
	73	FCD => \&FCD, FCC => \&FCC,
	74	);
	75
ac5ea531 JH	76	sub normalize($$)
ac5ea531 JH	77	{
d85850a7	78	my $form = shift;
f027f502	79	my $str = shift;
fe067ad9 SP	80	if (exists $formNorm{$form}) {
	81	return $formNorm{$form}->($str);
	82	}
	83	croak($PACKAGE."::normalize: invalid form name: $form");
ac5ea531 JH	84	}
ac5ea531 JH	85
82e740b6 NC	86
	87	##
	88	## quick check
	89	##
	90
	91	our %formCheck = (
	92	NFC => \&checkNFC, C => \&checkNFC,
	93	NFD => \&checkNFD, D => \&checkNFD,
	94	NFKC => \&checkNFKC, KC => \&checkNFKC,
	95	NFKD => \&checkNFKD, KD => \&checkNFKD,
	96	FCD => \&checkFCD, FCC => \&checkFCC,
	97	);
	98
8f118dcd JH	99	sub check($$)
	100	{
	101	my $form = shift;
f027f502	102	my $str = shift;
fe067ad9 SP	103	if (exists $formCheck{$form}) {
	104	return $formCheck{$form}->($str);
	105	}
	106	croak($PACKAGE."::check: invalid form name: $form");
8f118dcd JH	107	}
8f118dcd JH	108
ac5ea531 JH	109	1;
ac5ea531 JH	110	__END__
2a204b45 JH	111
	112	=head1 NAME
	113
f027f502	114	Unicode::Normalize - Unicode Normalization Forms
2a204b45 JH	115
	116	=head1 SYNOPSIS
	117
a092bcfd RGS	118	(1) using function names exported by default:
a092bcfd RGS	119
2a204b45 JH	120	use Unicode::Normalize;
2a204b45 JH	121
8f118dcd JH	122	$NFD_string = NFD($string); # Normalization Form D
	123	$NFC_string = NFC($string); # Normalization Form C
	124	$NFKD_string = NFKD($string); # Normalization Form KD
	125	$NFKC_string = NFKC($string); # Normalization Form KC
2a204b45	126
a092bcfd	127	(2) using function names exported on request:
2a204b45 JH	128
	129	use Unicode::Normalize 'normalize';
	130
8f118dcd JH	131	$NFD_string = normalize('D', $string); # Normalization Form D
	132	$NFC_string = normalize('C', $string); # Normalization Form C
	133	$NFKD_string = normalize('KD', $string); # Normalization Form KD
	134	$NFKC_string = normalize('KC', $string); # Normalization Form KC
2a204b45 JH	135
	136	=head1 DESCRIPTION
	137
00f2676f JH	138	Parameters:
00f2676f JH	139
fe067ad9	140	C<$string> is used as a string under character semantics (see F<perlunicode>).
00f2676f	141
fe067ad9	142	C<$code_point> should be an unsigned integer representing a Unicode code point.
00f2676f	143
628bbff0	144	Note: Between XSUB and pure Perl, there is an incompatibility
fe067ad9 SP	145	about the interpretation of C<$code_point> as a decimal number.
	146	XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
	147	Do not use a floating point nor a negative sign in C<$code_point>.
00f2676f	148
d85850a7	149	=head2 Normalization Forms
2a204b45 JH	150
	151	=over 4
	152
8f118dcd	153	=item C<$NFD_string = NFD($string)>
2a204b45	154
fe067ad9	155	It returns the Normalization Form D (formed by canonical decomposition).
2a204b45	156
8f118dcd	157	=item C<$NFC_string = NFC($string)>
2a204b45	158
fe067ad9	159	It returns the Normalization Form C (formed by canonical decomposition
2a204b45 JH	160	followed by canonical composition).
2a204b45 JH	161
8f118dcd	162	=item C<$NFKD_string = NFKD($string)>
2a204b45	163
fe067ad9	164	It returns the Normalization Form KD (formed by compatibility decomposition).
2a204b45	165
8f118dcd	166	=item C<$NFKC_string = NFKC($string)>
2a204b45	167
fe067ad9	168	It returns the Normalization Form KC (formed by compatibility decomposition
2a204b45 JH	169	followed by B<canonical> composition).
2a204b45 JH	170
82e740b6 NC	171	=item C<$FCD_string = FCD($string)>
	172
	173	If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
fe067ad9	174	it returns the string without modification; otherwise it returns an FCD string.
82e740b6 NC	175
	176	Note: FCD is not always unique, then plural forms may be equivalent
	177	each other. C<FCD()> will return one of these equivalent forms.
	178
	179	=item C<$FCC_string = FCC($string)>
	180
fe067ad9	181	It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
82e740b6	182
e524f5b2	183	Note: FCC is unique, as well as four normalization forms (NF*).
82e740b6	184
8f118dcd	185	=item C<$normalized_string = normalize($form_name, $string)>
2a204b45	186
fe067ad9 SP	187	It returns the normalization form of C<$form_name>.
fe067ad9 SP	188
2a204b45 JH	189	As C<$form_name>, one of the following names must be given.
2a204b45 JH	190
82e740b6 NC	191	'C' or 'NFC' for Normalization Form C (UAX #15)
	192	'D' or 'NFD' for Normalization Form D (UAX #15)
	193	'KC' or 'NFKC' for Normalization Form KC (UAX #15)
	194	'KD' or 'NFKD' for Normalization Form KD (UAX #15)
	195
	196	'FCD' for "Fast C or D" Form (UTN #5)
	197	'FCC' for "Fast C Contiguous" (UTN #5)
2a204b45 JH	198
	199	=back
	200
8f118dcd JH	201	=head2 Decomposition and Composition
	202
	203	=over 4
	204
fe067ad9	205	=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
8f118dcd	206
fe067ad9 SP	207	It returns the concatenation of the decomposition of each character
fe067ad9 SP	208	in the string.
8f118dcd	209
fe067ad9 SP	210	If the second parameter (a boolean) is omitted or false,
	211	the decomposition is canonical decomposition;
	212	if the second parameter (a boolean) is true,
	213	the decomposition is compatibility decomposition.
8f118dcd	214
fe067ad9	215	The string returned is not always in NFD/NFKD. Reordering may be required.
8f118dcd JH	216
	217	$NFD_string = reorder(decompose($string)); # eq. to NFD()
	218	$NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
	219
fe067ad9	220	=item C<$reordered_string = reorder($string)>
8f118dcd	221
fe067ad9 SP	222	It returns the result of reordering the combining characters
fe067ad9 SP	223	according to Canonical Ordering Behavior.
8f118dcd	224
fe067ad9 SP	225	For example, when you have a list of NFD/NFKD strings,
fe067ad9 SP	226	you can get the concatenated NFD/NFKD string from them, by saying
8f118dcd JH	227
	228	$concat_NFD = reorder(join '', @NFD_strings);
	229	$concat_NFKD = reorder(join '', @NFKD_strings);
	230
fe067ad9	231	=item C<$composed_string = compose($string)>
8f118dcd	232
fe067ad9 SP	233	It returns the result of canonical composition
fe067ad9 SP	234	without applying any decomposition.
8f118dcd	235
fe067ad9 SP	236	For example, when you have a NFD/NFKD string,
fe067ad9 SP	237	you can get its NFC/NFKC string, by saying
8f118dcd JH	238
	239	$NFC_string = compose($NFD_string);
	240	$NFKC_string = compose($NFKD_string);
	241
	242	=back
	243
	244	=head2 Quick Check
	245
82e740b6	246	(see Annex 8, UAX #15; and F<DerivedNormalizationProps.txt>)
8f118dcd JH	247
	248	The following functions check whether the string is in that normalization form.
	249
fe067ad9	250	The result returned will be one of the following:
8f118dcd JH	251
	252	YES The string is in that normalization form.
	253	NO The string is not in that normalization form.
	254	MAYBE Dubious. Maybe yes, maybe no.
	255
	256	=over 4
	257
	258	=item C<$result = checkNFD($string)>
	259
fe067ad9	260	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd JH	261
	262	=item C<$result = checkNFC($string)>
	263
fe067ad9	264	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0	265	C<undef> if C<MAYBE>.
8f118dcd JH	266
	267	=item C<$result = checkNFKD($string)>
	268
fe067ad9	269	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
8f118dcd JH	270
	271	=item C<$result = checkNFKC($string)>
	272
fe067ad9	273	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0	274	C<undef> if C<MAYBE>.
8f118dcd	275
82e740b6 NC	276	=item C<$result = checkFCD($string)>
82e740b6 NC	277
fe067ad9	278	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
82e740b6 NC	279
	280	=item C<$result = checkFCC($string)>
	281
fe067ad9	282	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0	283	C<undef> if C<MAYBE>.
82e740b6	284
fe067ad9	285	Note: If a string is not in FCD, it must not be in FCC.
82e740b6 NC	286	So C<checkFCC($not_FCD_string)> should return C<NO>.
82e740b6 NC	287
8f118dcd JH	288	=item C<$result = check($form_name, $string)>
8f118dcd JH	289
fe067ad9	290	It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
628bbff0	291	C<undef> if C<MAYBE>.
8f118dcd	292
628bbff0 RGS	293	As C<$form_name>, one of the following names must be given.
	294
	295	'C' or 'NFC' for Normalization Form C (UAX #15)
	296	'D' or 'NFD' for Normalization Form D (UAX #15)
	297	'KC' or 'NFKC' for Normalization Form KC (UAX #15)
	298	'KD' or 'NFKD' for Normalization Form KD (UAX #15)
	299
	300	'FCD' for "Fast C or D" Form (UTN #5)
	301	'FCC' for "Fast C Contiguous" (UTN #5)
8f118dcd JH	302
	303	=back
	304
	305	B<Note>
	306
82e740b6 NC	307	In the cases of NFD, NFKD, and FCD, the answer must be
	308	either C<YES> or C<NO>. The answer C<MAYBE> may be returned
	309	in the cases of NFC, NFKC, and FCC.
8f118dcd	310
82e740b6 NC	311	A C<MAYBE> string should contain at least one combining character
82e740b6 NC	312	or the like. For example, C<COMBINING ACUTE ACCENT> has
8f118dcd	313	the MAYBE_NFC/MAYBE_NFKC property.
82e740b6	314
8f118dcd JH	315	Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
8f118dcd JH	316	and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
f027f502	317	C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
8f118dcd JH	318	(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
	319	while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
	320
628bbff0 RGS	321	If you want to check exactly, compare the string with its NFC/NFKC/FCC.
	322
	323	if ($string eq NFC($string)) {
	324	# $string is exactly normalized in NFC;
	325	} else {
	326	# $string is not normalized in NFC;
	327	}
8f118dcd	328
628bbff0 RGS	329	if ($string eq NFKC($string)) {
	330	# $string is exactly normalized in NFKC;
	331	} else {
	332	# $string is not normalized in NFKC;
	333	}
8f118dcd	334
2a204b45 JH	335	=head2 Character Data
	336
	337	These functions are interface of character data used internally.
d0ed0342 BG	338	If you want only to get Unicode normalization forms, you don't need
d0ed0342 BG	339	call them yourself.
2a204b45 JH	340
	341	=over 4
	342
fe067ad9	343	=item C<$canonical_decomposition = getCanon($code_point)>
2a204b45	344
fe067ad9 SP	345	If the character is canonically decomposable (including Hangul Syllables),
	346	it returns the (full) canonical decomposition as a string.
	347	Otherwise it returns C<undef>.
8f118dcd	348
fe067ad9 SP	349	B<Note:> According to the Unicode standard, the canonical decomposition
	350	of the character that is not canonically decomposable is same as
	351	the character itself.
8f118dcd	352
fe067ad9	353	=item C<$compatibility_decomposition = getCompat($code_point)>
2a204b45	354
fe067ad9 SP	355	If the character is compatibility decomposable (including Hangul Syllables),
	356	it returns the (full) compatibility decomposition as a string.
	357	Otherwise it returns C<undef>.
2a204b45	358
fe067ad9 SP	359	B<Note:> According to the Unicode standard, the compatibility decomposition
	360	of the character that is not compatibility decomposable is same as
	361	the character itself.
2a204b45	362
fe067ad9	363	=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
2a204b45	364
fe067ad9	365	If two characters here and next (as code points) are composable
8f118dcd	366	(including Hangul Jamo/Syllables and Composition Exclusions),
fe067ad9 SP	367	it returns the code point of the composite.
	368
	369	If they are not composable, it returns C<undef>.
2a204b45	370
fe067ad9	371	=item C<$combining_class = getCombinClass($code_point)>
2a204b45	372
fe067ad9	373	It returns the combining class (as an integer) of the character.
2a204b45	374
fe067ad9	375	=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
2a204b45	376
fe067ad9 SP	377	It returns a boolean whether the character of the specified codepoint
	378	may be composed with the previous one in a certain composition
	379	(including Hangul Compositions, but excluding
	380	Composition Exclusions and Non-Starter Decompositions).
2a204b45	381
fe067ad9	382	=item C<$is_exclusion = isExclusion($code_point)>
8f118dcd	383
fe067ad9	384	It returns a boolean whether the code point is a composition exclusion.
8f118dcd	385
fe067ad9	386	=item C<$is_singleton = isSingleton($code_point)>
8f118dcd	387
fe067ad9	388	It returns a boolean whether the code point is a singleton
8f118dcd	389
fe067ad9	390	=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
8f118dcd	391
fe067ad9	392	It returns a boolean whether the code point has Non-Starter Decomposition.
8f118dcd	393
fe067ad9 SP	394	=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
	395
	396	It returns a boolean of the derived property Comp_Ex
	397	(Full_Composition_Exclusion). This property is generated from
	398	Composition Exclusions + Singletons + Non-Starter Decompositions.
	399
	400	=item C<$NFD_is_NO = isNFD_NO($code_point)>
	401
	402	It returns a boolean of the derived property NFD_NO
	403	(NFD_Quick_Check=No).
	404
	405	=item C<$NFC_is_NO = isNFC_NO($code_point)>
	406
	407	It returns a boolean of the derived property NFC_NO
	408	(NFC_Quick_Check=No).
	409
	410	=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
	411
	412	It returns a boolean of the derived property NFC_MAYBE
	413	(NFC_Quick_Check=Maybe).
	414
	415	=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
	416
	417	It returns a boolean of the derived property NFKD_NO
	418	(NFKD_Quick_Check=No).
	419
	420	=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
	421
	422	It returns a boolean of the derived property NFKC_NO
	423	(NFKC_Quick_Check=No).
	424
	425	=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
	426
	427	It returns a boolean of the derived property NFKC_MAYBE
	428	(NFKC_Quick_Check=Maybe).
2a204b45 JH	429
	430	=back
	431
628bbff0	432	=head1 EXPORT
2a204b45 JH	433
	434	C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
	435
	436	C<normalize> and other some functions: on request.
	437
628bbff0 RGS	438	=head1 CAVEATS
	439
	440	=over 4
	441
	442	=item Perl's version vs. Unicode version
	443
	444	Since this module refers to perl core's Unicode database in the directory
	445	F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
	446	normalization implemented by this module depends on your perl's version.
	447
fe067ad9 SP	448	perl's version implemented Unicode version
	449	5.6.1 3.0.1
	450	5.7.2 3.1.0
	451	5.7.3 3.1.1 (normalization is same as 3.1.0)
	452	5.8.0 3.2.0
	453	5.8.1-5.8.3 4.0.0
	454	5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0)
	455	5.8.7-5.8.8 4.1.0
51683ce6 TS	456	5.10.0 5.0.0
51683ce6 TS	457	5.8.9 5.1.0
628bbff0 RGS	458
	459	=item Correction of decomposition mapping
	460
	461	In older Unicode versions, a small number of characters (all of which are
	462	CJK compatibility ideographs as far as they have been found) may have
	463	an erroneous decomposition mapping (see F<NormalizationCorrections.txt>).
	464	Anyhow, this module will neither refer to F<NormalizationCorrections.txt>
	465	nor provide any specific version of normalization. Therefore this module
	466	running on an older perl with an older Unicode database may use
	467	the erroneous decomposition mapping blindly conforming to the Unicode database.
	468
	469	=item Revised definition of canonical composition
	470
	471	In Unicode 4.1.0, the definition D2 of canonical composition (which
	472	affects NFC and NFKC) has been changed (see Public Review Issue #29
	473	and recent UAX #15). This module has used the newer definition
	474	since the version 0.07 (Oct 31, 2001).
2b8d773d	475	This module will not support the normalization according to the older
628bbff0 RGS	476	definition, even if the Unicode version implemented by perl is
	477	lower than 4.1.0.
	478
	479	=back
	480
2a204b45 JH	481	=head1 AUTHOR
2a204b45 JH	482
a092bcfd	483	SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
2a204b45	484
2b8d773d	485	Copyright(C) 2001-2007, SADAHIRO Tomoyuki. Japan. All rights reserved.
2a204b45	486
628bbff0 RGS	487	This module is free software; you can redistribute it
628bbff0 RGS	488	and/or modify it under the same terms as Perl itself.
2a204b45 JH	489
	490	=head1 SEE ALSO
	491
	492	=over 4
	493
e524f5b2	494	=item http://www.unicode.org/reports/tr15/
2a204b45 JH	495
	496	Unicode Normalization Forms - UAX #15
	497
fe067ad9 SP	498	=item http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt
	499
	500	Composition Exclusion Table
	501
14e6b36c	502	=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt
8f118dcd JH	503
	504	Derived Normalization Properties
	505
628bbff0 RGS	506	=item http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt
	507
	508	Normalization Corrections
	509
	510	=item http://www.unicode.org/review/pr-29.html
	511
	512	Public Review Issue #29: Normalization Issue
	513
82e740b6 NC	514	=item http://www.unicode.org/notes/tn5/
	515
	516	Canonical Equivalence in Applications - UTN #5
	517
2a204b45 JH	518	=back
	519
	520	=cut