Start using the Unicode 3.2.0 data (NOTE: still

[perl5.git] / lib / Unicode / UCD.pm
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 841c373..96dee9a 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -50,8 +50,8 @@ Unicode::UCD - Unicode character database
  
  =head1 DESCRIPTION
  
-The Unicode::UCD module offers a simple interface to the Unicode Character
-Database.
+The Unicode::UCD module offers a simple interface to the Unicode
+Character Database.
  
  =cut
  
@@ -108,11 +108,11 @@ as defined by the Unicode standard:
      title            titlecase equivalent mapping
  
      block            block the character belongs to (used in \p{In...})
-    script           script the character belongs to 
+    script           script the character belongs to
  
  If no match is found, a reference to an empty hash is returned.
  
-The C<block> property is the same as as returned by charinfo().  It is
+The C<block> property is the same as returned by charinfo().  It is
  not defined in the Unicode Character Database proper (Chapter 4 of the
  Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
  (Chapter 14 of TUS3).  Similarly for the C<script> property.
@@ -135,85 +135,30 @@ sub _getcode {
      return;
  }
  
-sub han_charname {
-    my $arg  = shift;
-    my $code = _getcode($arg);
-    croak __PACKAGE__, "::han_charname: unknown code '$arg'"
-       unless defined $code;
-    croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
-        unless 0x3400  <= $code && $code <= 0x4DB5  
-            || 0x4E00  <= $code && $code <= 0x9FA5  
-            || 0x20000 <= $code && $code <= 0x2A6D6;
-    sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
+# Lingua::KO::Hangul::Util is not part of the standard distribution
+# but it will be used if available.
+
+eval { require Lingua::KO::Hangul::Util };
+my $hasHangulUtil = ! $@;
+if ($hasHangulUtil) {
+    Lingua::KO::Hangul::Util->import();
  }
  
-my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
-    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
-    "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
-  );
-
-my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
-    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
-    "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
-    "YU", "EU", "YI", "I",
-  );
-
-my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
-    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
-    "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
-    "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
-  );
-
-my %HangulConst = (
-   SBase  => 0xAC00,
-   LBase  => 0x1100,
-   VBase  => 0x1161,
-   TBase  => 0x11A7,
-   LCount => 19,     # scalar @JamoL
-   VCount => 21,     # scalar @JamoV
-   TCount => 28,     # scalar @JamoT
-   NCount => 588,    # VCount * TCount
-   SCount => 11172,  # LCount * NCount
-   Final  => 0xD7A3, # SBase -1 + SCount
-  );
-
-sub hangul_charname {
-    my $arg  = shift;
-    my $code = _getcode($arg);
-    croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
-       unless defined $code;
-    croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
-        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
-    my $SIndex = $code - $HangulConst{SBase};
-    my $LIndex = int( $SIndex / $HangulConst{NCount});
-    my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
-    my $TIndex =      $SIndex % $HangulConst{TCount};
-    return join('',
-        "HANGUL SYLLABLE ",
-        $JamoL[$LIndex],
-        $JamoV[$VIndex],
-        $JamoT[$TIndex],
-      );
+sub hangul_decomp { # internal: called from charinfo
+    if ($hasHangulUtil) {
+       my @tmp = decomposeHangul(shift);
+       return sprintf("%04X %04X",      @tmp) if @tmp == 2;
+       return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
+    }
+    return;
  }
  
-sub hangul_decomp {
-    my $arg  = shift;
-    my $code = _getcode($arg);
-    croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
-       unless defined $code;
-    croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
-        unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
-    my $SIndex = $code - $HangulConst{SBase};
-    my $LIndex = int( $SIndex / $HangulConst{NCount});
-    my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
-    my $TIndex =      $SIndex % $HangulConst{TCount};
-
-    return join(" ",
-        sprintf("%04X", $HangulConst{LBase} + $LIndex),
-        sprintf("%04X", $HangulConst{VBase} + $VIndex),
-      $TIndex ?
-        sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
-    );
+sub hangul_charname { # internal: called from charinfo
+    return sprintf("HANGUL SYLLABLE-%04X", shift);
+}
+
+sub han_charname { # internal: called from charinfo
+    return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
  }
  
  my @CharinfoRanges = (
@@ -224,7 +169,7 @@ my @CharinfoRanges = (
  # CJK Ideographs
    [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
  # Hangul Syllables
-  [ 0xAC00,   0xD7A3,   \&hangul_charname, \&hangul_decomp  ],
+  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
  # Non-Private Use High Surrogates
    [ 0xD800,   0xDB7F,   undef,   undef  ],
  # Private Use High Surrogates
@@ -259,7 +204,7 @@ sub charinfo {
          last;
        }
      }
-    openunicode(\$UNICODEFH, "Unicode.txt");
+    openunicode(\$UNICODEFH, "UnicodeData.txt");
      if (defined $UNICODEFH) {
         use Search::Dict 1.02;
         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
@@ -335,13 +280,12 @@ positions within all blocks are defined.
  
  See also L</Blocks versus Scripts>.
  
-If supplied with an argument that can't be a code point, charblock()
-tries to do the opposite and interpret the argument as a character
-block.  The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs.  You can test whether a code point
-is in a range using the L</charinrange> function.  If the argument is
-not a known charater block, C<undef> is returned.
+If supplied with an argument that can't be a code point, charblock() tries
+to do the opposite and interpret the argument as a character block. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character block, C<undef> is returned.
  
  =cut
  
@@ -397,13 +341,12 @@ character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
  
  See also L</Blocks versus Scripts>.
  
-If supplied with an argument that can't be a code point, charscript()
-tries to do the opposite and interpret the argument as a character
-script.  The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs.  You can test whether a code point
-is in a range using the L</charinrange> function.  If the argument is
-not a known charater script, C<undef> is returned.
+If supplied with an argument that can't be a code point, charscript() tries
+to do the opposite and interpret the argument as a character script. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character script, C<undef> is returned.
  
  =cut
  
@@ -488,13 +431,13 @@ sub charscripts {
  The difference between a block and a script is that scripts are closer
  to the linguistic notion of a set of characters required to present
  languages, while block is more of an artifact of the Unicode character
-numbering and separation into blocks of 256 characters.
+numbering and separation into blocks of (mostly) 256 characters.
  
  For example the Latin B<script> is spread over several B<blocks>, such
  as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
  C<Latin Extended-B>.  On the other hand, the Latin script does not
  contain all the characters of the C<Basic Latin> block (also known as
-the ASCII): it includes only the letters, not for example the digits
+the ASCII): it includes only the letters, and not, for example, the digits
  or the punctuation.
  
  For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
@@ -503,23 +446,15 @@ For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
  
  =head2 Matching Scripts and Blocks
  
-Both scripts and blocks can be matched using the regular expression
-construct C<\p{In...}> and its negation C<\P{In...}>.
-
-The name of the script or the block comes after the C<In>, for example
-C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
-removed from the names for the C<\p{In...}>, for example
-C<LatinExtendedA> instead of C<Latin Extended-A>.
-
-There are a few cases where there is both a script and a block by the
-same name, in these cases the block version has C<Block> appended to
-its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
-the block.
+Scripts are matched with the regular-expression construct
+C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
+while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
+any of the 256 code points in the Tibetan block).
  
  =head2 Code Point Arguments
  
-A <code point argument> is either a decimal or a hexadecimal scalar
-designating a Unicode character, or "U+" followed by hexadecimals
+A I<code point argument> is either a decimal or a hexadecimal scalar
+designating a Unicode character, or C<U+> followed by hexadecimals
  designating a Unicode character.  Note that Unicode is B<not> limited
  to 16 bits (the number of Unicode characters is open-ended, in theory
  unlimited): you may have more than 4 hexdigits.
@@ -557,9 +492,9 @@ my %COMPEXCL;
  
  sub _compexcl {
      unless (%COMPEXCL) {
-       if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+       if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
             while (<$COMPEXCLFH>) {
-               if (/^([0-9A-F]+) \# /) {
+               if (/^([0-9A-F]+)\s+\#\s+/) {
                     my $code = hex($1);
                     $COMPEXCL{$code} = undef;
                 }
@@ -627,7 +562,7 @@ my %CASEFOLD;
  
  sub _casefold {
      unless (%CASEFOLD) {
-       if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+       if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
             while (<$CASEFOLDFH>) {
                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
                     my $code = hex($1);
@@ -707,7 +642,7 @@ my %CASESPEC;
  
  sub _casespec {
      unless (%CASESPEC) {
-       if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+       if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
             while (<$CASESPECFH>) {
                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
                     my ($hexcode, $lower, $title, $upper, $condition) =
@@ -723,9 +658,9 @@ sub _casespec {
                                                            title
                                                            upper
                                                            condition)};
-                           my ($oldlocale) =
+                           if (defined $oldcondition) {
+                               my ($oldlocale) =
                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
-                           if (defined $oldlocale) {
                                 delete $CASESPEC{$code};
                                 $CASESPEC{$code}->{$oldlocale} =
                                 { code      => $hexcode,
@@ -733,8 +668,6 @@ sub _casespec {
                                   title     => $oldtitle,
                                   upper     => $oldupper,
                                   condition => $oldcondition };
-                           } else {
-                               warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
                             }
                         }
                         my ($locale) =
@@ -800,6 +733,10 @@ Character Database (the database is included in the Perl distribution).
  The filehandle is then kept open for further queries.  In other words,
  if you are wondering where one of your filehandles went, that's where.
  
+=head1 BUGS
+
+Does not yet support EBCDIC platforms.
+
  =head1 AUTHOR
  
  Jarkko Hietaniemi