Unicode::UCD: Add charprop public function

[perl5.git] / lib / Unicode / UCD.pm
diff --git a/lib/Unicode/UCD.pm b/lib/Unicode/UCD.pm

index 393de35..b0f770a 100644 (file)
--- a/lib/Unicode/UCD.pm
+++ b/lib/Unicode/UCD.pm
@@ -15,6 +15,7 @@ our @EXPORT_OK = qw(charinfo
                     charblock charscript
                     charblocks charscripts
                     charinrange
+                   charprop
                     general_categories bidi_types
                     compexcl
                     casefold all_casefolds casespec
@@ -42,6 +43,9 @@ Unicode::UCD - Unicode character database
      use Unicode::UCD 'charinfo';
      my $charinfo   = charinfo($codepoint);
  
+    use Unicode::UCD 'charprop';
+    my $value  = charprop($codepoint, $property);
+
      use Unicode::UCD 'casefold';
      my $casefold = casefold($codepoint);
  
@@ -111,7 +115,8 @@ Character Database.
  
  Some of the functions are called with a I<code point argument>, which is either
  a decimal or a hexadecimal scalar designating a code point in the platform's
-native character set (extended to Unicode), or C<U+> followed by hexadecimals
+native character set (extended to Unicode), or a string containing C<U+>
+followed by hexadecimals
  designating a Unicode code point.  A leading 0 will force a hexadecimal
  interpretation, as will a hexadecimal digit that isn't a decimal digit.
  
@@ -120,7 +125,7 @@ Examples:
      223     # Decimal 223 in native character set
      0223    # Hexadecimal 223, native (= 547 decimal)
      0xDF    # Hexadecimal DF, native (= 223 decimal)
-    U+DF    # Hexadecimal DF, in Unicode's character set
+    'U+DF'  # Hexadecimal DF, in Unicode's character set
                                (= LATIN SMALL LETTER SHARP S)
  
  Note that the largest code point in Unicode is U+10FFFF.
@@ -197,6 +202,10 @@ C<undef> is returned.
  Fields that aren't applicable to the particular code point argument exist in the
  returned hash, and are empty. 
  
+For results that are less "raw" than this function returns, or to get the values for
+any property, not just the few covered by this function, use the
+L</charprop()> function.
+
  The keys in the hash with the meanings of their values are:
  
  =over
@@ -284,30 +293,30 @@ As of Unicode 6.0, this is always empty.
  
  =item B<upper>
  
-is empty if there is no single code point uppercase mapping for I<code>
-(its uppercase mapping is itself);
-otherwise it is that mapping expressed as at least four hexdigits.
-(L</casespec()> should be used in addition to B<charinfo()>
-for case mappings when the calling program can cope with multiple code point
-mappings.)
+is, if non-empty, the uppercase mapping for I<code> expressed as at least four
+hexdigits.  This indicates that the full uppercase mapping is a single
+character, and is identical to the simple (single-character only) mapping.
+When this field is empty, it means that the simple uppercase mapping is
+I<code> itself; you'll need some other means (like L</charprop()> or
+L</casespec()>) to get the full mapping.
  
  =item B<lower>
  
-is empty if there is no single code point lowercase mapping for I<code>
-(its lowercase mapping is itself);
-otherwise it is that mapping expressed as at least four hexdigits.
-(L</casespec()> should be used in addition to B<charinfo()>
-for case mappings when the calling program can cope with multiple code point
-mappings.)
+is, if non-empty, the lowercase mapping for I<code> expressed as at least four
+hexdigits.  This indicates that the full lowercase mapping is a single
+character, and is identical to the simple (single-character only) mapping.
+When this field is empty, it means that the simple lowercase mapping is
+I<code> itself; you'll need some other means (like L</charprop()> or
+L</casespec()>) to get the full mapping.
  
  =item B<title>
  
-is empty if there is no single code point titlecase mapping for I<code>
-(its titlecase mapping is itself);
-otherwise it is that mapping expressed as at least four hexdigits.
-(L</casespec()> should be used in addition to B<charinfo()>
-for case mappings when the calling program can cope with multiple code point
-mappings.)
+is, if non-empty, the titlecase mapping for I<code> expressed as at least four
+hexdigits.  This indicates that the full titlecase mapping is a single
+character, and is identical to the simple (single-character only) mapping.
+When this field is empty, it means that the simple titlecase mapping is
+I<code> itself; you'll need some other means (like L</charprop()> or
+L</casespec()>) to get the full mapping.
  
  =item B<block>
  
@@ -602,6 +611,170 @@ sub charinrange {
      _search($range, 0, $#$range, $code);
  }
  
+=head2 B<charprop()>
+
+    use Unicode::UCD 'charprop';
+
+    print charprop(0x41, "Gc"), "\n";
+    print charprop(0x61, "General_Category"), "\n";
+
+  prints
+    Lu
+    Ll
+
+This returns the value of the Unicode property given by the second parameter
+for the L</code point argument> given by the first.
+
+The passed-in property may be specified as any of the synonyms returned by
+L</prop_aliases()>.
+
+The return value is always a scalar, either a string or a number.  For
+properties where there are synonyms for the values, the synonym returned by
+this function is the longest, most descriptive form, the one returned by
+L</prop_value_aliases()> when called in a scalar context.  Of course, you can
+call L</prop_value_aliases()> on the result to get other synonyms.
+
+The return values are more "cooked" than the L</charinfo()> ones.  For
+example, the C<"uc"> property value is the actual string containing the full
+uppercase mapping of the input code point.  You have to go to extra trouble
+with C<charinfo> to get this value from its C<upper> hash element when the
+full mapping differs from the simple one.
+
+Special note should be made of the return values for a few properties:
+
+=over
+
+=item Block
+
+The value returned is the new-style block name (see L</Old-style versus
+new-style block names>).
+
+=item Decomposition_Mapping
+
+Like L</charinfo()>, the result may be an intermediate decomposition whose
+components are also decomposable.  Use L<Unicode::Normalize> to get the final
+decomposition in one step.
+
+Unlike L</charinfo()>, this does not include the decomposition type.  Use the
+C<Decomposition_Type> property to get that.
+
+=item Name_Alias
+
+If the input code point's name has more than one synonym, they are returned
+joined into a single comma-separated string.
+
+=item Numeric_Value
+
+If the result is a fraction, it is converted into a floating point number to
+the accuracy of your platform.
+
+=item Script_Extensions
+
+If the result is multiple script names, they are returned joined into a single
+comma-separated string.
+
+=back
+
+When called with a property that is a Perl extension that isn't expressible in
+a compound form, this function currently returns C<undef>, as the only two
+possible values are I<true> or I<false> (1 or 0 I suppose).  This behavior may
+change in the future, so don't write code that relies on it.  C<Present_In> is
+a Perl extension that is expressible in a bipartite or compound form (for
+example, C<\p{Present_In=4.0}>), so C<charprop> accepts it.  But C<Any> is a
+Perl extension that isn't expressible that way, so C<charprop> returns
+C<undef> for it.  Also C<charprop> returns C<undef> for all Perl extensions
+that are internal-only.
+
+=cut
+
+# Returns the value of property $prop for the code point $input_cp as a
+# single scalar (the public contract is described in the POD above).
+#
+#   $input_cp  a code point argument, in any form _getcode() accepts
+#   $prop      a property name, in any form prop_invmap() accepts
+#
+# Returns undef when prop_invmap() yields nothing for $prop, and for
+# properties not in %prop_aliases whose map entry is just "Y" or "N".  Croaks
+# when the code point can't be parsed, when the inversion list has no entry
+# for it, or when prop_invmap() reports a format not handled here.
+sub charprop ($$) {
+    my ($input_cp, $prop) = @_;
+
+    my $cp = _getcode($input_cp);
+    croak __PACKAGE__, "::charprop: unknown code point '$input_cp'" unless defined $cp;
+
+    my ($list_ref, $map_ref, $format, $default)
+                                      = prop_invmap($prop);
+
+    # 'return undef' (not a bare return) is kept deliberately so that callers
+    # in list context continue to get a one-element list.
+    return undef unless defined $list_ref;
+
+    my $i = search_invlist($list_ref, $cp);
+    croak __PACKAGE__, "::charprop: prop_invmap return is invalid for charprop('$input_cp', '$prop')" unless defined $i;
+
+    # $i is the index into both the inversion list and map of $cp.
+    my $map = $map_ref->[$i];
+
+    # Convert enumeration values to their most complete form.  Array-ref
+    # entries (multi-element values) are left alone here and are flattened
+    # by the format-specific code below.
+    if (! ref $map) {
+        my $long_form = prop_value_aliases($prop, $map);
+        $map = $long_form if defined $long_form;
+    }
+
+    if ($format =~ / ^ s /x) {  # Scalars
+        return join ",", @$map if ref $map; # Convert to scalar with comma
+                                            # separated array elements
+
+        # Resolve ambiguity as to whether an all digit value is a code point
+        # that should be converted to a character, or whether it is really
+        # just a number.  To do this, look at the default.  If it is a
+        # non-empty number, we can safely assume the result is also a number.
+        if ($map =~ / ^ \d+ $ /ax && $default !~ / ^ \d+ $ /ax) {
+            $map = chr $map;
+        }
+        elsif ($map =~ / ^ (?: Y | N ) $ /x) {
+
+            # prop_invmap() returns these values for properties that are Perl
+            # extensions.  But this is misleading.  For now, return undef for
+            # these, as currently documented.
+            undef $map unless
+                exists $Unicode::UCD::prop_aliases{utf8::_loose_name(lc $prop)};
+        }
+        return $map;
+    }
+    elsif ($format eq 'ar') {   # numbers, including rationals
+
+        # Distance of $cp from the start of the range that contains it.
+        my $offset = $cp - $list_ref->[$i];
+        return $map if $map =~ /nan/i;
+        return $map + $offset if $offset != 0;  # If needs adjustment
+
+        # NOTE: this is a string eval.  $map originates from the map that
+        # prop_invmap() returned, not directly from the caller's arguments.
+        return eval $map;   # Convert e.g., 1/2 to 0.5
+    }
+    elsif ($format =~ /^a/) {   # Some entries need adjusting
+
+        # Linearize sequences into a string.
+        return join "", map { chr $_ } @$map if ref $map; # XXX && $format =~ /^ a [dl] /x;
+
+        return "" if $map eq "" && $format =~ /^a.*e/;
+
+        # These are all character mappings.  Return the chr if no adjustment
+        # is needed
+        return chr $cp if $map eq "0";
+
+        # Convert special entry.
+        if ($map eq '<hangul syllable>' && $format eq 'ad') {
+            use Unicode::Normalize qw(NFD);
+            return NFD(chr $cp);
+        }
+
+        # The rest need adjustment from the first entry in the inversion list
+        # corresponding to this map.
+        my $offset = $cp - $list_ref->[$i];
+        return chr($map + $offset);
+    }
+    elsif ($format eq 'n') {    # The name property
+
+        # There are two special cases, handled here.
+        if ($map =~ / ( .+ ) <code\ point> $ /x) {
+
+            # The captured prefix is followed by the code point in hex, at
+            # least four digits wide.
+            $map = sprintf("$1%04X", $cp);
+        }
+        elsif ($map eq '<hangul syllable>') {
+            $map = charnames::viacode($cp);
+        }
+        return $map;
+    }
+    else {
+
+        # croak() does not return, so nothing follows it.
+        croak __PACKAGE__, "::charprop: Internal error: unknown format '$format'.  Please perlbug this";
+    }
+}
+
  =head2 B<charblock()>
  
      use Unicode::UCD 'charblock';
@@ -2070,10 +2243,10 @@ are only a few dozen possible General Categories.
  
  You can use L</prop_values()> to find out if a given property is one which has
  a restricted set of values, and if so, what those values are.  But usually
-each value actually has several synonyms.  For example, in binary properties,
-I<truth> can be represented by any of the strings "Y", "Yes", "T", or "True";
-and the General Category "Punctuation" by that string, or "Punct", or simply
-"P".
+each value actually has several synonyms.  For example, in Unicode binary
+properties, I<truth> can be represented by any of the strings "Y", "Yes", "T",
+or "True"; and the General Category "Punctuation" by that string, or "Punct",
+or simply "P".
  
  Like property names, there is typically at least a short name for each such
  property-value, and a long name.  If you know any name of the property-value
@@ -2097,7 +2270,7 @@ C<undef>.
  
  If called with a property that doesn't have synonyms for its values, it
  returns the input value, possibly normalized with capitalization and
-underscores.
+underscores, but not necessarily checking that the input value is valid.
  
  For the block property, new-style block names are returned (see
  L</Old-style versus new-style block names>).
@@ -2129,7 +2302,18 @@ sub prop_value_aliases ($$) {
      # anything, like most (if not all) string properties.  These don't have
      # synonyms anyway.  Simply return the input.  For example, there is no
      # synonym for ('Uppercase_Mapping', A').
-    return $value if ! exists $prop_value_aliases{$prop};
+    if (! exists $prop_value_aliases{$prop}) {
+
+        # Here, we have a legal property, but an unknown value.  Since the
+        # property is legal, if it isn't in the prop_aliases hash, it must be
+        # a Perl extension.  All Perl extensions are binary, hence are
+        # enumerated types, which means that we know that the input unknown
+        # value is illegal.
+        return if ! exists $Unicode::UCD::prop_aliases{$prop};
+
+        # Otherwise, we assume it's valid, as documented.
+        return $value;
+    }
  
      # The value name may be loosely or strictly matched; we don't know yet.
      # But both types use lower-case.
@@ -2890,6 +3074,14 @@ Use L</casefold()> for these.
  C<prop_invmap> does not know about any user-defined properties, and will
  return C<undef> if called with one of those.
  
+The returned values for the Perl extension properties, such as C<Any> and
+C<Greek>, are somewhat misleading.  The values are either C<"Y"> or C<"N">.
+All Unicode properties are bipartite, so you can actually use the C<"Y"> or
+C<"N"> in a Perl regular expression for these, like C<qr/\p{ID_Start=Y}/> or
+C<qr/\p{Upper=N}/>.  But the Perl extensions aren't specified this way, only
+like C<qr/\p{Any}/>, I<etc>.  You can't actually use the C<"Y"> and C<"N"> in
+them.
+
  =cut
  
  # User-defined properties could be handled with some changes to utf8_heavy.pl;
@@ -3530,8 +3722,15 @@ RETRY:
                  # If the overrides came from SPECIALS, the code point keys are
                  # packed UTF-8.
                  if ($overrides == $swash->{'SPECIALS'}) {
-                    $cp = unpack("C0U", $cp_maybe_utf8);
-                    @map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
+                    $cp = $cp_maybe_utf8;
+                    if (! utf8::decode($cp)) {
+                        croak __PACKAGE__, "::prop_invmap: Malformed UTF-8: ",
+                              map { sprintf("\\x{%02X}", unpack("C", $_)) }
+                                                                split "", $cp;
+                    }
+
+                    $cp = unpack("W", $cp);
+                    @map = unpack "W*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
  
                      # The empty string will show up unpacked as an empty
                      # array.
@@ -3787,7 +3986,7 @@ as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
  C<Latin Extended-B>.  On the other hand, the Latin script does not
  contain all the characters of the C<Basic Latin> block (also known as
  ASCII): it includes only the letters, and not, for example, the digits
-or the punctuation.
+nor the punctuation.
  
  For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
  
@@ -3816,8 +4015,9 @@ The newer style replaces these with underscores, like this:
  
  This newer style is consistent with the values of other Unicode properties.
  To preserve backward compatibility, all the functions in Unicode::UCD that
-return block names (except one) return the old-style ones.  That one function,
-L</prop_value_aliases()> can be used to convert from old-style to new-style:
+return block names (except as noted) return the old-style ones.
+L</prop_value_aliases()> returns the new-style and can be used to convert from
+old-style to new-style:
  
   my $new_style = prop_value_aliases("block", $old_style);