Update to Unicode::Normalize 0.15 (+ the EBCDIC guards)

[perl5.git] / ext / Unicode / Normalize / Normalize.pm
diff --git a/ext/Unicode/Normalize/Normalize.pm b/ext/Unicode/Normalize/Normalize.pm

index cec5fa7..b3bc9d6 100644 (file)
--- a/ext/Unicode/Normalize/Normalize.pm
+++ b/ext/Unicode/Normalize/Normalize.pm
@@ -2,7 +2,7 @@ package Unicode::Normalize;
  
  BEGIN {
      if (ord("A") == 193) {
-       die "Unicode::Normalize not ported to EBCDIC\n";
+       die "Unicode::Normalize not ported to EBCDIC\n";
      }
  }
  
@@ -11,7 +11,7 @@ use strict;
  use warnings;
  use Carp;
  
-our $VERSION = '0.14';
+our $VERSION = '0.15';
  our $PACKAGE = __PACKAGE__;
  
  require Exporter;
@@ -22,9 +22,16 @@ our @ISA = qw(Exporter DynaLoader);
  our @EXPORT = qw( NFC NFD NFKC NFKD );
  our @EXPORT_OK = qw(
      normalize decompose reorder compose
-    getCanon getCompat getComposite getCombinClass isExclusion
+    checkNFD checkNFKD checkNFC checkNFKC check
+    getCanon getCompat getComposite getCombinClass
+    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
+    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
+);
+our %EXPORT_TAGS = (
+    all       => [ @EXPORT, @EXPORT_OK ],
+    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
+    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
  );
-our %EXPORT_TAGS = ( all => [ @EXPORT, @EXPORT_OK ] );
  
  bootstrap Unicode::Normalize $VERSION;
  
@@ -32,7 +39,6 @@ use constant COMPAT => 1;
  
  sub NFD  ($) { reorder(decompose($_[0])) }
  sub NFKD ($) { reorder(decompose($_[0], COMPAT)) }
-
  sub NFC  ($) { compose(reorder(decompose($_[0]))) }
  sub NFKC ($) { compose(reorder(decompose($_[0], COMPAT))) }
  
@@ -48,6 +54,18 @@ sub normalize($$)
        croak $PACKAGE."::normalize: invalid form name: $form";
  }
  
+sub check($$)
+{
+    (my $form = shift) =~ s/^NF//;
+    # Quick-check dispatcher: $form is the form name minus any leading "NF".
+    return checkNFD ($_[0]) if $form eq 'D';
+    return checkNFC ($_[0]) if $form eq 'C';
+    return checkNFKD($_[0]) if $form eq 'KD';
+    return checkNFKC($_[0]) if $form eq 'KC';
+    # Any other name is rejected, reported from the caller's perspective.
+    croak $PACKAGE."::check: invalid form name: $form";
+}
+
  1;
  __END__
  
@@ -59,19 +77,19 @@ Unicode::Normalize - normalized forms of Unicode text
  
    use Unicode::Normalize;
  
-  $string_NFD  = NFD($raw_string);  # Normalization Form D
-  $string_NFC  = NFC($raw_string);  # Normalization Form C
-  $string_NFKD = NFKD($raw_string); # Normalization Form KD
-  $string_NFKC = NFKC($raw_string); # Normalization Form KC
+  $NFD_string  = NFD($string);  # Normalization Form D
+  $NFC_string  = NFC($string);  # Normalization Form C
+  $NFKD_string = NFKD($string); # Normalization Form KD
+  $NFKC_string = NFKC($string); # Normalization Form KC
  
     or
  
    use Unicode::Normalize 'normalize';
  
-  $string_NFD  = normalize('D',  $raw_string);  # Normalization Form D
-  $string_NFC  = normalize('C',  $raw_string);  # Normalization Form C
-  $string_NFKD = normalize('KD', $raw_string);  # Normalization Form KD
-  $string_NFKC = normalize('KC', $raw_string);  # Normalization Form KC
+  $NFD_string  = normalize('D',  $string);  # Normalization Form D
+  $NFC_string  = normalize('C',  $string);  # Normalization Form C
+  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
+  $NFKC_string = normalize('KC', $string);  # Normalization Form KC
  
  =head1 DESCRIPTION
  
@@ -79,26 +97,25 @@ Unicode::Normalize - normalized forms of Unicode text
  
  =over 4
  
-=item C<$string_NFD = NFD($raw_string)>
+=item C<$NFD_string = NFD($string)>
  
  returns the Normalization Form D (formed by canonical decomposition).
  
-
-=item C<$string_NFC = NFC($raw_string)>
+=item C<$NFC_string = NFC($string)>
  
  returns the Normalization Form C (formed by canonical decomposition
  followed by canonical composition).
  
-=item C<$string_NFKD = NFKD($raw_string)>
+=item C<$NFKD_string = NFKD($string)>
  
  returns the Normalization Form KD (formed by compatibility decomposition).
  
-=item C<$string_NFKC = NFKC($raw_string)>
+=item C<$NFKC_string = NFKC($string)>
  
  returns the Normalization Form KC (formed by compatibility decomposition
  followed by B<canonical> composition).
  
-=item C<$normalized_string = normalize($form_name, $raw_string)>
+=item C<$normalized_string = normalize($form_name, $string)>
  
  As C<$form_name>, one of the following names must be given.
  
@@ -109,6 +126,107 @@ As C<$form_name>, one of the following names must be given.
  
  =back
  
+=head2 Decomposition and Composition
+
+=over 4
+
+=item C<$decomposed_string = decompose($string)>
+
+=item C<$decomposed_string = decompose($string, $useCompatMapping)>
+
+Decomposes the specified string and returns the result.
+
+If the second parameter (a boolean) is omitted or false, decomposes it
+using the Canonical Decomposition Mapping.
+If true, decomposes it using the Compatibility Decomposition Mapping.
+
+The string returned is not always in NFD/NFKD.
+Reordering may be required.
+
+    $NFD_string  = reorder(decompose($string));    # eq. to NFD()
+    $NFKD_string = reorder(decompose($string, 1)); # eq. to NFKD()
+
+=item C<$reordered_string  = reorder($string)>
+
+Reorders the combining characters and the like in the canonical ordering
+and returns the result.
+
+E.g., when you have a list of NFD/NFKD strings,
+you can get the concatenated NFD/NFKD string from them, saying
+
+    $concat_NFD  = reorder(join '', @NFD_strings);
+    $concat_NFKD = reorder(join '', @NFKD_strings);
+
+=item C<$composed_string   = compose($string)>
+
+Returns the string where composable pairs are composed.
+
+E.g., when you have a NFD/NFKD string,
+you can get its NFC/NFKC string, saying
+
+    $NFC_string  = compose($NFD_string);
+    $NFKC_string = compose($NFKD_string);
+
+=back
+
+=head2 Quick Check
+
+(see Annex 8, UAX #15; F<DerivedNormalizationProperties.txt>)
+
+The following functions check whether the string is in that normalization form.
+
+The result returned will be:
+
+    YES     The string is in that normalization form.
+    NO      The string is not in that normalization form.
+    MAYBE   Dubious. Maybe yes, maybe no.
+
+=over 4
+
+=item C<$result = checkNFD($string)>
+
+returns YES (1) or NO (empty string).
+
+=item C<$result = checkNFC($string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+=item C<$result = checkNFKD($string)>
+
+returns YES (1) or NO (empty string).
+
+=item C<$result = checkNFKC($string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+=item C<$result = check($form_name, $string)>
+
+returns YES (1), NO (empty string), or MAYBE (undef).
+
+C<$form_name> is the same as that for C<normalize()>.
+
+=back
+
+B<Note>
+
+In the cases of NFD and NFKD, the answer must be either C<YES> or C<NO>.
+The answer C<MAYBE> may be returned in the cases of NFC and NFKC.
+
+A MAYBE-NFC/NFKC string should contain at least
+one combining character or the like.
+For example, C<COMBINING ACUTE ACCENT> has
+the MAYBE_NFC/MAYBE_NFKC property.
+Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
+and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
+However, C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
+(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
+while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
+
+If you want to check exactly, compare the string with its NFC/NFKC; i.e.,
+
+    $string eq NFC($string)    # more thorough than checkNFC($string)
+    $string eq NFKC($string)   # more thorough than checkNFKC($string)
+
  =head2 Character Data
  
  These functions are interface of character data used internally.
@@ -119,30 +237,54 @@ call them yourself.
  
  =item C<$canonical_decomposed = getCanon($codepoint)>
  
+If the character of the specified codepoint is canonically
+decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string canonically equivalent to it.
+
+If it is not decomposable, returns undef.
+
  =item C<$compatibility_decomposed = getCompat($codepoint)>
  
-If the character of the specified codepoint is canonically or 
-compatibility decomposable (including Hangul Syllables),
-returns the B<completely decomposed> string equivalent to it.
+If the character of the specified codepoint is compatibility
+decomposable (including Hangul Syllables),
+returns the B<completely decomposed> string compatibility equivalent to it.
  
  If it is not decomposable, returns undef.
  
-=item C<$uv_composite = getComposite($uv_here, $uv_next)>
+=item C<$codepoint_composite = getComposite($codepoint_here, $codepoint_next)>
  
  If two characters here and next (as codepoints) are composable
-(including Hangul Jamo/Syllables and Exclusions),
+(including Hangul Jamo/Syllables and Composition Exclusions),
  returns the codepoint of the composite.
  
  If they are not composable, returns undef.
  
  =item C<$combining_class = getCombinClass($codepoint)>
  
-Returns the combining class as integer of the character.
+Returns the combining class of the character as an integer.
  
  =item C<$is_exclusion = isExclusion($codepoint)>
  
+Returns a boolean whether the character of the specified codepoint
+is a composition exclusion.
+
+=item C<$is_singleton = isSingleton($codepoint)>
+
  Returns a boolean whether the character of the specified codepoint is
-a composition exclusion.
+a singleton.
+
+=item C<$is_non_starter_decomposition = isNonStDecomp($codepoint)>
+
+Returns a boolean whether the canonical decomposition
+of the character of the specified codepoint
+is a Non-Starter Decomposition.
+
+=item C<$may_be_composed_with_prev_char = isComp2nd($codepoint)>
+
+Returns a boolean whether the character of the specified codepoint
+may be composed with the previous one in a certain composition
+(including Hangul Compositions, but excluding
+Composition Exclusions and Non-Starter Decompositions).
  
  =back
  
@@ -152,16 +294,6 @@ C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
  
  C<normalize> and other some functions: on request.
  
-=head2 TODO
-
-Unicode::Normalize has not been ported to EBCDIC.  The code mostly
-would work just fine but a decision needs to be made: how the module
-should work in EBCDIC?  Should the low 256 characters be understood as
-Unicode or as EBCDIC code points?  Should one be chosen or should
-there be a way to do either?  Or should such translation be left
-outside the module for the user to do, for example by using
-Encode::from_to()?
-
  =head1 AUTHOR
  
  SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
@@ -181,6 +313,10 @@ SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
  
  Unicode Normalization Forms - UAX #15
  
+=item http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProperties.txt
+
+Derived Normalization Properties
+
  =back
  
  =cut