Start using the Unicode 3.2.0 data (NOTE: still

[perl5.git] / lib / Unicode / UCD.pm
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 2cc0ece..96dee9a 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -108,11 +108,11 @@ as defined by the Unicode standard:
      title            titlecase equivalent mapping
  
      block            block the character belongs to (used in \p{In...})
-    script           script the character belongs to 
+    script           script the character belongs to
  
  If no match is found, a reference to an empty hash is returned.
  
-The C<block> property is the same as as returned by charinfo().  It is
+The C<block> property is the same as returned by charinfo().  It is
  not defined in the Unicode Character Database proper (Chapter 4 of the
  Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
  (Chapter 14 of TUS3).  Similarly for the C<script> property.
@@ -135,14 +135,26 @@ sub _getcode {
      return;
  }
  
-use Lingua::KO::Hangul::Util;
+# Lingua::KO::Hangul::Util is not part of the standard distribution
+# but it will be used if available.
+
+eval { require Lingua::KO::Hangul::Util };
+my $hasHangulUtil = ! $@;
+if ($hasHangulUtil) {
+    Lingua::KO::Hangul::Util->import();
+}
  
  sub hangul_decomp { # internal: called from charinfo
-  my @tmp = decomposeHangul(shift);
-  return
-    @tmp == 2 ? sprintf("%04X %04X",      @tmp) :
-    @tmp == 3 ? sprintf("%04X %04X %04X", @tmp) :
-      undef;
+    if ($hasHangulUtil) {
+       my @tmp = decomposeHangul(shift);
+       return sprintf("%04X %04X",      @tmp) if @tmp == 2;
+       return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
+    }
+    return;
+}
+
+sub hangul_charname { # internal: called from charinfo
+    return sprintf("HANGUL SYLLABLE-%04X", shift);
  }
  
  sub han_charname { # internal: called from charinfo
@@ -157,7 +169,7 @@ my @CharinfoRanges = (
  # CJK Ideographs
    [ 0x4E00,   0x9FA5,   \&han_charname,   undef  ],
  # Hangul Syllables
-  [ 0xAC00,   0xD7A3,   \&getHangulName,  \&hangul_decomp ],
+  [ 0xAC00,   0xD7A3,   $hasHangulUtil ? \&getHangulName : \&hangul_charname,  \&hangul_decomp ],
  # Non-Private Use High Surrogates
    [ 0xD800,   0xDB7F,   undef,   undef  ],
  # Private Use High Surrogates
@@ -192,7 +204,7 @@ sub charinfo {
          last;
        }
      }
-    openunicode(\$UNICODEFH, "Unicode.txt");
+    openunicode(\$UNICODEFH, "UnicodeData.txt");
      if (defined $UNICODEFH) {
         use Search::Dict 1.02;
         if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
@@ -268,13 +280,12 @@ positions within all blocks are defined.
  
  See also L</Blocks versus Scripts>.
  
-If supplied with an argument that can't be a code point, charblock()
-tries to do the opposite and interpret the argument as a character
-block.  The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs.  You can test whether a code point
-is in a range using the L</charinrange> function.  If the argument is
-not a known charater block, C<undef> is returned.
+If supplied with an argument that can't be a code point, charblock() tries
+to do the opposite and interpret the argument as a character block. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character block, C<undef> is returned.
  
  =cut
  
@@ -330,13 +341,12 @@ character belongs to, e.g.  C<Latin>, C<Greek>, C<Han>.
  
  See also L</Blocks versus Scripts>.
  
-If supplied with an argument that can't be a code point, charscript()
-tries to do the opposite and interpret the argument as a character
-script.  The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs.  You can test whether a code point
-is in a range using the L</charinrange> function.  If the argument is
-not a known charater script, C<undef> is returned.
+If supplied with an argument that can't be a code point, charscript() tries
+to do the opposite and interpret the argument as a character script. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character script, C<undef> is returned.
  
  =cut
  
@@ -421,13 +431,13 @@ sub charscripts {
  The difference between a block and a script is that scripts are closer
  to the linguistic notion of a set of characters required to present
  languages, while block is more of an artifact of the Unicode character
-numbering and separation into blocks of 256 characters.
+numbering and separation into blocks of (mostly) 256 characters.
  
  For example the Latin B<script> is spread over several B<blocks>, such
  as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
  C<Latin Extended-B>.  On the other hand, the Latin script does not
  contain all the characters of the C<Basic Latin> block (also known as
-the ASCII): it includes only the letters, not for example the digits
+ASCII): it includes only the letters, and not, for example, the digits
  or the punctuation.
  
  For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
@@ -436,23 +446,15 @@ For scripts see UTR #24: http://www.unicode.org/unicode/reports/tr24/
  
  =head2 Matching Scripts and Blocks
  
-Both scripts and blocks can be matched using the regular expression
-construct C<\p{In...}> and its negation C<\P{In...}>.
-
-The name of the script or the block comes after the C<In>, for example
-C<\p{InCyrillic}>, C<\P{InBasicLatin}>.  Spaces and dashes ('-') are
-removed from the names for the C<\p{In...}>, for example
-C<LatinExtendedA> instead of C<Latin Extended-A>.
-
-There are a few cases where there is both a script and a block by the
-same name, in these cases the block version has C<Block> appended to
-its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
-the block.
+Scripts are matched with the regular-expression construct
+C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
+while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
+any of the 256 code points in the Tibetan block).
  
  =head2 Code Point Arguments
  
-A <code point argument> is either a decimal or a hexadecimal scalar
-designating a Unicode character, or "U+" followed by hexadecimals
+A I<code point argument> is either a decimal or a hexadecimal scalar
+designating a Unicode character, or C<U+> followed by hexadecimals
  designating a Unicode character.  Note that Unicode is B<not> limited
  to 16 bits (the number of Unicode characters is open-ended, in theory
  unlimited): you may have more than 4 hexdigits.
@@ -490,9 +492,9 @@ my %COMPEXCL;
  
  sub _compexcl {
      unless (%COMPEXCL) {
-       if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+       if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
             while (<$COMPEXCLFH>) {
-               if (/^([0-9A-F]+) \# /) {
+               if (/^([0-9A-F]+)\s+\#\s+/) {
                     my $code = hex($1);
                     $COMPEXCL{$code} = undef;
                 }
@@ -560,7 +562,7 @@ my %CASEFOLD;
  
  sub _casefold {
      unless (%CASEFOLD) {
-       if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+       if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
             while (<$CASEFOLDFH>) {
                 if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
                     my $code = hex($1);
@@ -640,7 +642,7 @@ my %CASESPEC;
  
  sub _casespec {
      unless (%CASESPEC) {
-       if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+       if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
             while (<$CASESPECFH>) {
                 if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
                     my ($hexcode, $lower, $title, $upper, $condition) =
@@ -656,9 +658,9 @@ sub _casespec {
                                                            title
                                                            upper
                                                            condition)};
-                           my ($oldlocale) =
+                           if (defined $oldcondition) {
+                               my ($oldlocale) =
                                 ($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
-                           if (defined $oldlocale) {
                                 delete $CASESPEC{$code};
                                 $CASESPEC{$code}->{$oldlocale} =
                                 { code      => $hexcode,
@@ -666,8 +668,6 @@ sub _casespec {
                                   title     => $oldtitle,
                                   upper     => $oldupper,
                                   condition => $oldcondition };
-                           } else {
-                               warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
                             }
                         }
                         my ($locale) =