This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add a simple Unicode character database interface, Unicode::UCD.
[perl5.git] / lib / Unicode / UCD.pm
CommitLineData
561c79ed
JH
1package Unicode::UCD;
2
3use strict;
4use warnings;
5
6our $VERSION = v3.1.0;
7
8require Exporter;
9
10our @ISA = qw(Exporter);
11our @EXPORT_OK = qw(charinfo charblock);
12
13use Carp;
14
15=head1 NAME
16
17Unicode - Unicode character database
18
19=head1 SYNOPSIS
20
21 use Unicode::UCD 3.1.0;
22 # requires that level of the Unicode character database
23
24 use Unicode::UCD 'charinfo';
25 my %charinfo = charinfo($codepoint);
26
27 use Unicode::UCD 'charblock';
28 my $charblock = charblock($codepoint);
29
30=head1 DESCRIPTION
31
32The Unicode module offers a simple interface to the Unicode Character
33Database.
34
35=cut
36
37my $UNICODE;
38my $BLOCKS;
39
40sub openunicode {
41 my ($rfh, @path) = @_;
42 my $f;
43 unless (defined $$rfh) {
44 for my $d (@INC) {
45 use File::Spec;
46 $f = File::Spec->catfile($d, "unicode", @path);
47 if (open($$rfh, $f)) {
48 last;
49 } else {
50 croak __PACKAGE__, ": open '$f' failed: $!\n";
51 }
52 }
53 croak __PACKAGE__, ": failed to find ",join("/",@path)," in @INC\n"
54 unless defined $rfh;
55 }
56 return $f;
57}
58
59=head2 charinfo
60
61 use Unicode::UCD 'charinfo';
62
63 my %charinfo = charinfo(0x41);
64
65charinfo() returns a hash that has the following fields as defined
66by the Unicode standard:
67
68 key
69
70 code code point with at least four hexdigits
71 name name of the character IN UPPER CASE
72 category general category of the character
73 combining classes used in the Canonical Ordering Algorithm
74 bidi bidirectional category
75 decomposition character decomposition mapping
76 decimal if decimal digit this is the integer numeric value
77 digit if digit this is the numeric value
78 numeric if numeric is the integer or rational numeric value
79 mirrored if mirrored in bidirectional text
80 unicode10 Unicode 1.0 name if existed and different
81 comment ISO 10646 comment field
82 upper uppercase equivalent mapping
83 lower lowercase equivalent mapping
84 title titlecase equivalent mapping
85 block block the character belongs to (used in \p{In...})
86
87If no match is found, an empty hash is returned.
88
89The C<block> property is the same as as returned by charinfo().
90(It is not defined in the Unicode Character Database proper but
91instead in an auxiliary database.)
92
93=cut
94
95sub charinfo {
96 my $code = shift;
97 my $hexk = sprintf("%04X", $code);
98
99 openunicode(\$UNICODE, "Unicode.txt");
100 if (defined $UNICODE) {
101 use Search::Dict;
102 if (look($UNICODE, "$hexk;") >= 0) {
103 my $line = <$UNICODE>;
104 chomp $line;
105 my %prop;
106 @prop{qw(
107 code name category
108 combining bidi decomposition
109 decimal digit numeric
110 mirrored unicode10 comment
111 upper lower title
112 )} = split(/;/, $line, -1);
113 if ($prop{code} eq $hexk) {
114 $prop{block} = charblock($code);
115 return %prop;
116 }
117 }
118 }
119 return;
120}
121
122=head2 charbloc
123
124 use Unicode::UCD 'charblock';
125
126 my $charblock = charblock(0x41);
127
128charblock() returns the block the character belongs to, e.g.
129C<Basic Latin>. Note that not all the character positions within all
130block are defined.
131
132The name is the same name that is used in the C<\p{In...}> construct,
133for example C<\p{InBasicLatin}> (spaces and dashes ('-') are squished
134away from the names for the C<\p{In...}>.
135
136=cut
137
138my @BLOCKS;
139
140sub _charblock {
141 my ($code, $lo, $hi) = @_;
142
143 return if $lo > $hi;
144
145 my $mid = int(($lo+$hi) / 2);
146
147 if ($BLOCKS[$mid]->[0] < $code) {
148 if ($BLOCKS[$mid]->[1] >= $code) {
149 return $BLOCKS[$mid]->[2];
150 } else {
151 _charblock($code, $mid + 1, $hi);
152 }
153 } elsif ($BLOCKS[$mid]->[0] > $code) {
154 _charblock($code, $lo, $mid - 1);
155 } else {
156 return $BLOCKS[$mid]->[2];
157 }
158}
159
160sub charblock {
161 my $code = shift;
162
163 unless (@BLOCKS) {
164 if (openunicode(\$BLOCKS, "Blocks.pl")) {
165 while (<$BLOCKS>) {
166 if (/^([0-9A-F]+)\s+([0-9A-F]+)\s+(.+)/) {
167 push @BLOCKS, [ hex($1), hex($2), $3 ];
168 }
169 }
170 close($BLOCKS);
171 }
172 }
173
174 _charblock($code, 0, $#BLOCKS);
175}
176
177=head1 AUTHOR
178
179Jarkko Hietaniemi
180
181=cut
182
1831;