=head1 DESCRIPTION
-The Unicode::UCD module offers a simple interface to the Unicode Character
-Database.
+The Unicode::UCD module offers a simple interface to the Unicode
+Character Database.
=cut
title titlecase equivalent mapping
block block the character belongs to (used in \p{In...})
- script script the character belongs to
+ script script the character belongs to
If no match is found, a reference to an empty hash is returned.
-The C<block> property is the same as as returned by charinfo(). It is
+The C<block> property is the same as returned by charinfo(). It is
not defined in the Unicode Character Database proper (Chapter 4 of the
Unicode 3.0 Standard, aka TUS3) but instead in an auxiliary database
(Chapter 14 of TUS3). Similarly for the C<script> property.
return;
}
-sub han_charname {
- my $arg = shift;
- my $code = _getcode($arg);
- croak __PACKAGE__, "::han_charname: unknown code '$arg'"
- unless defined $code;
- croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'"
- unless 0x3400 <= $code && $code <= 0x4DB5
- || 0x4E00 <= $code && $code <= 0x9FA5
- || 0x20000 <= $code && $code <= 0x2A6D6;
- sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code;
+# Lingua::KO::Hangul::Util is not part of the standard distribution
+# but it will be used if available.
+
+eval { require Lingua::KO::Hangul::Util };
+my $hasHangulUtil = ! $@;
+if ($hasHangulUtil) {
+ Lingua::KO::Hangul::Util->import();
}
-my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG)
- "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
- "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
- );
-
-my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG)
- "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
- "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
- "YU", "EU", "YI", "I",
- );
-
-my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG)
- "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
- "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
- "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
- );
-
-my %HangulConst = (
- SBase => 0xAC00,
- LBase => 0x1100,
- VBase => 0x1161,
- TBase => 0x11A7,
- LCount => 19, # scalar @JamoL
- VCount => 21, # scalar @JamoV
- TCount => 28, # scalar @JamoT
- NCount => 588, # VCount * TCount
- SCount => 11172, # LCount * NCount
- Final => 0xD7A3, # SBase -1 + SCount
- );
-
-sub hangul_charname {
- my $arg = shift;
- my $code = _getcode($arg);
- croak __PACKAGE__, "::hangul_charname: unknown code '$arg'"
- unless defined $code;
- croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'"
- unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
- my $SIndex = $code - $HangulConst{SBase};
- my $LIndex = int( $SIndex / $HangulConst{NCount});
- my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
- my $TIndex = $SIndex % $HangulConst{TCount};
- return join('',
- "HANGUL SYLLABLE ",
- $JamoL[$LIndex],
- $JamoV[$VIndex],
- $JamoT[$TIndex],
- );
+sub hangul_decomp { # internal: called from charinfo
+ if ($hasHangulUtil) {
+ my @tmp = decomposeHangul(shift);
+ return sprintf("%04X %04X", @tmp) if @tmp == 2;
+ return sprintf("%04X %04X %04X", @tmp) if @tmp == 3;
+ }
+ return;
}
-sub hangul_decomp {
- my $arg = shift;
- my $code = _getcode($arg);
- croak __PACKAGE__, "::hangul_decomp: unknown code '$arg'"
- unless defined $code;
- croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'"
- unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final};
- my $SIndex = $code - $HangulConst{SBase};
- my $LIndex = int( $SIndex / $HangulConst{NCount});
- my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount});
- my $TIndex = $SIndex % $HangulConst{TCount};
-
- return join(" ",
- sprintf("%04X", $HangulConst{LBase} + $LIndex),
- sprintf("%04X", $HangulConst{VBase} + $VIndex),
- $TIndex ?
- sprintf("%04X", $HangulConst{TBase} + $TIndex) : (),
- );
+sub hangul_charname { # internal: called from charinfo
+ return sprintf("HANGUL SYLLABLE-%04X", shift);
+}
+
+sub han_charname { # internal: called from charinfo
+ return sprintf("CJK UNIFIED IDEOGRAPH-%04X", shift);
}
my @CharinfoRanges = (
# CJK Ideographs
[ 0x4E00, 0x9FA5, \&han_charname, undef ],
# Hangul Syllables
- [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ],
+ [ 0xAC00, 0xD7A3, $hasHangulUtil ? \&getHangulName : \&hangul_charname, \&hangul_decomp ],
# Non-Private Use High Surrogates
[ 0xD800, 0xDB7F, undef, undef ],
# Private Use High Surrogates
last;
}
}
- openunicode(\$UNICODEFH, "Unicode.txt");
+ openunicode(\$UNICODEFH, "UnicodeData.txt");
if (defined $UNICODEFH) {
use Search::Dict 1.02;
if (look($UNICODEFH, "$hexk;", { xfrm => sub { $_[0] =~ /^([^;]+);(.+)/; sprintf "%06X;$2", hex($1) } } ) >= 0) {
See also L</Blocks versus Scripts>.
-If supplied with an argument that can't be a code point, charblock()
-tries to do the opposite and interpret the argument as a character
-block. The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs. You can test whether a code point
-is in a range using the L</charinrange> function. If the argument is
-not a known charater block, C<undef> is returned.
+If supplied with an argument that can't be a code point, charblock() tries
+to do the opposite and interpret the argument as a character block. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character block, C<undef> is returned.
=cut
See also L</Blocks versus Scripts>.
-If supplied with an argument that can't be a code point, charscript()
-tries to do the opposite and interpret the argument as a character
-script. The return value is a I<range>: an anonymous list that
-contains anonymous lists, which in turn contain I<start-of-range>,
-I<end-of-range> code point pairs. You can test whether a code point
-is in a range using the L</charinrange> function. If the argument is
-not a known charater script, C<undef> is returned.
+If supplied with an argument that can't be a code point, charscript() tries
+to do the opposite and interpret the argument as a character script. The
+return value is a I<range>: an anonymous list of lists that contain
+I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
+code point is in a range using the L</charinrange> function. If the
+argument is not a known character script, C<undef> is returned.
=cut
The difference between a block and a script is that scripts are closer
to the linguistic notion of a set of characters required to present
languages, while block is more of an artifact of the Unicode character
-numbering and separation into blocks of 256 characters.
+numbering and separation into blocks of (mostly) 256 characters.
For example the Latin B<script> is spread over several B<blocks>, such
as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
C<Latin Extended-B>. On the other hand, the Latin script does not
contain all the characters of the C<Basic Latin> block (also known as
-the ASCII): it includes only the letters, not for example the digits
+ASCII): it includes only the letters, and not, for example, the digits
or the punctuation.
For blocks see http://www.unicode.org/Public/UNIDATA/Blocks.txt
=head2 Matching Scripts and Blocks
-Both scripts and blocks can be matched using the regular expression
-construct C<\p{In...}> and its negation C<\P{In...}>.
-
-The name of the script or the block comes after the C<In>, for example
-C<\p{InCyrillic}>, C<\P{InBasicLatin}>. Spaces and dashes ('-') are
-removed from the names for the C<\p{In...}>, for example
-C<LatinExtendedA> instead of C<Latin Extended-A>.
-
-There are a few cases where there is both a script and a block by the
-same name, in these cases the block version has C<Block> appended to
-its name: C<\p{InKatakana}> is the script, C<\p{InKatakanaBlock}> is
-the block.
+Scripts are matched with the regular-expression construct
+C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
+while C<\p{In...}> is used for blocks (e.g. C<\p{InTibetan}> matches
+any of the 256 code points in the Tibetan block).
=head2 Code Point Arguments
-A <code point argument> is either a decimal or a hexadecimal scalar
-designating a Unicode character, or "U+" followed by hexadecimals
+A I<code point argument> is either a decimal or a hexadecimal scalar
+designating a Unicode character, or C<U+> followed by hexadecimals
designating a Unicode character. Note that Unicode is B<not> limited
to 16 bits (the number of Unicode characters is open-ended, in theory
unlimited): you may have more than 4 hexdigits.
sub _compexcl {
unless (%COMPEXCL) {
- if (openunicode(\$COMPEXCLFH, "CompExcl.txt")) {
+ if (openunicode(\$COMPEXCLFH, "CompositionExclusions.txt")) {
while (<$COMPEXCLFH>) {
- if (/^([0-9A-F]+) \# /) {
+ if (/^([0-9A-F]+)\s+\#\s+/) {
my $code = hex($1);
$COMPEXCL{$code} = undef;
}
sub _casefold {
unless (%CASEFOLD) {
- if (openunicode(\$CASEFOLDFH, "CaseFold.txt")) {
+ if (openunicode(\$CASEFOLDFH, "CaseFolding.txt")) {
while (<$CASEFOLDFH>) {
if (/^([0-9A-F]+); ([CFSI]); ([0-9A-F]+(?: [0-9A-F]+)*);/) {
my $code = hex($1);
sub _casespec {
unless (%CASESPEC) {
- if (openunicode(\$CASESPECFH, "SpecCase.txt")) {
+ if (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
while (<$CASESPECFH>) {
if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; ([0-9A-F]+(?: [0-9A-F]+)*)?; (\w+(?: \w+)*)?/) {
my ($hexcode, $lower, $title, $upper, $condition) =
title
upper
condition)};
- my ($oldlocale) =
+ if (defined $oldcondition) {
+ my ($oldlocale) =
($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
- if (defined $oldlocale) {
delete $CASESPEC{$code};
$CASESPEC{$code}->{$oldlocale} =
{ code => $hexcode,
title => $oldtitle,
upper => $oldupper,
condition => $oldcondition };
- } else {
- warn __PACKAGE__, ": SpecCase.txt:", $., ": No oldlocale for 0x$hexcode\n"
}
}
my ($locale) =
The filehandle is then kept open for further queries. In other words,
if you are wondering where one of your filehandles went, that's where.
+=head1 BUGS
+
+Does not yet support EBCDIC platforms.
+
=head1 AUTHOR
Jarkko Hietaniemi