This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Unicode::Collate 0.28
[perl5.git] / lib / Unicode / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c
JH
6 }
7}
8
45394607
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
45394607
JH
15require Exporter;
16
06c8fc8f 17our $VERSION = '0.28';
45394607
JH
18our $PACKAGE = __PACKAGE__;
19
20our @ISA = qw(Exporter);
21
22our %EXPORT_TAGS = ();
23our @EXPORT_OK = ();
24our @EXPORT = ();
25
26(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
27our $KeyFile = "allkeys.txt";
28
3164dd77
TS
29our $UNICODE_VERSION;
30
327745dc
TS
31eval { require Unicode::UCD };
32
33unless ($@) {
34 $UNICODE_VERSION = Unicode::UCD::UnicodeVersion();
35}
9f1f04a1 36else { # Perl 5.6.1
3164dd77
TS
37 my($f, $fh);
38 foreach my $d (@INC) {
3164dd77
TS
39 $f = File::Spec->catfile($d, "unicode", "Unicode.301");
40 if (open($fh, $f)) {
327745dc 41 $UNICODE_VERSION = '3.0.1';
3164dd77
TS
42 close $fh;
43 last;
44 }
45 }
46}
47
4d36a948
TS
48# Perl's boolean
49use constant TRUE => 1;
50use constant FALSE => "";
51use constant NOMATCHPOS => -1;
52
53# A coderef to get combining class imported from Unicode::Normalize
54# (i.e. \&Unicode::Normalize::getCombinClass).
55# This is also used as a HAS_UNICODE_NORMALIZE flag.
06c8fc8f 56our $CVgetCombinClass;
4d36a948 57
9f1f04a1
RGS
58# Supported Levels
59use constant MinLevel => 1;
60use constant MaxLevel => 4;
61
4d36a948 62# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
63use constant Min2Wt => 0x20;
64use constant Min3Wt => 0x02;
4d36a948
TS
65
66# Shifted weight at 4th level
9f1f04a1 67use constant Shift4Wt => 0xFFFF;
4d36a948
TS
68
69# Variable weight at 1st level.
70# This is a negative value but should be regarded as zero on collation.
71# This is for distinction of variable chars from level 3 ignorable chars.
9f1f04a1 72use constant Var1Wt => -1;
4d36a948
TS
73
74
75# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
76# PROBLEM: The Default Unicode Collation Element Table
77# has weights over 0xFFFF at the 4th level.
78# The tie-breaking in the variable weights
79# other than "shift" (as well as "shift-trimmed") is unreliable.
80use constant VCE_TEMPLATE => 'Cn4';
81
4d36a948
TS
82# A sort key: 16-bit weights
83# See also the PROBLEM on VCE_TEMPLATE above.
84use constant KEY_TEMPLATE => 'n*';
85
86# Level separator in a sort key:
87# i.e. pack(KEY_TEMPLATE, 0)
88use constant LEVEL_SEP => "\0\0";
89
90# As Unicode code point separator for hash keys.
91# A joined code point string (denoted by JCPS below)
92# like "65;768" is used for internal processing
93# instead of Perl's Unicode string like "\x41\x{300}",
94# as the native code point is different from the Unicode code point
95# on EBCDIC platform.
96# This character must not be included in any stringified
97# representation of an integer.
98use constant CODE_SEP => ';';
99
100# boolean values of variable weights
0116f5dc
JH
101use constant NON_VAR => 0; # Non-Variable character
102use constant VAR => 1; # Variable character
3164dd77 103
4d36a948
TS
104# Logical_Order_Exception in PropList.txt
105# TODO: synchronization with change of PropList.txt.
0116f5dc 106our $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 107
0116f5dc 108sub UCA_Version { "9" }
a7fbee98 109
0116f5dc 110sub Base_Unicode_Version { $UNICODE_VERSION || 'unknown' }
a7fbee98 111
9f1f04a1
RGS
112######
113
9f1f04a1 114sub pack_U {
ae6aa562 115 return pack('U*', @_);
9f1f04a1
RGS
116}
117
118sub unpack_U {
ae6aa562 119 return unpack('U*', pack('U*').shift);
9f1f04a1
RGS
120}
121
122######
123
0116f5dc
JH
124my (%AlternateOK);
125@AlternateOK{ qw/
126 blanked non-ignorable shifted shift-trimmed
127 / } = ();
128
129our @ChangeOK = qw/
130 alternate backwards level normalization rearrange
131 katakana_before_hiragana upper_before_lower
132 overrideHangul overrideCJK preprocess UCA_Version
133 /;
134
135our @ChangeNG = qw/
9f1f04a1 136 entry entries table maxlength
0116f5dc
JH
137 ignoreChar ignoreName undefChar undefName
138 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
4d36a948 139 derivCode normCode rearrangeHash L3_ignorable
9f1f04a1 140 backwardsFlag
0116f5dc 141 /;
9f1f04a1
RGS
142# The hash key 'ignored' is deleted at v 0.21.
143# The hash key 'isShift' is deleted at v 0.23.
144# The hash key 'combining' is deleted at v 0.24.
0116f5dc
JH
145
146my (%ChangeOK, %ChangeNG);
147@ChangeOK{ @ChangeOK } = ();
148@ChangeNG{ @ChangeNG } = ();
149
150sub change {
151 my $self = shift;
152 my %hash = @_;
153 my %old;
154 foreach my $k (keys %hash) {
155 if (exists $ChangeOK{$k}) {
156 $old{$k} = $self->{$k};
157 $self->{$k} = $hash{$k};
158 }
159 elsif (exists $ChangeNG{$k}) {
160 croak "change of $k via change() is not allowed!";
161 }
162 # else => ignored
163 }
164 $self->checkCollator;
165 return wantarray ? %old : $self;
166}
a7fbee98 167
9f1f04a1
RGS
168sub _checkLevel {
169 my $level = shift;
170 my $key = shift;
171 croak sprintf "Illegal level %d (in \$self->{%s}) lower than %d.",
172 $level, $key, MinLevel if MinLevel > $level;
173 croak sprintf "Unsupported level %d (in \$self->{%s}) higher than %d ",
174 $level, $key, MaxLevel if MaxLevel < $level;
175}
176
0116f5dc
JH
177sub checkCollator {
178 my $self = shift;
9f1f04a1 179 _checkLevel($self->{level}, "level");
a7fbee98 180
0116f5dc 181 $self->{derivCode} =
4d36a948
TS
182 $self->{UCA_Version} == 8 ? \&_derivCE_8 :
183 $self->{UCA_Version} == 9 ? \&_derivCE_9 :
0116f5dc 184 croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 185
0116f5dc
JH
186 $self->{alternate} = lc($self->{alternate});
187 croak "$PACKAGE unknown alternate tag name: $self->{alternate}"
188 unless exists $AlternateOK{ $self->{alternate} };
189
9f1f04a1
RGS
190 if (! defined $self->{backwards}) {
191 $self->{backwardsFlag} = 0;
192 }
193 elsif (! ref $self->{backwards}) {
194 _checkLevel($self->{backwards}, "backwards");
195 $self->{backwardsFlag} = 1 << $self->{backwards};
196 }
197 else {
198 my %level;
199 $self->{backwardsFlag} = 0;
200 for my $b (@{ $self->{backwards} }) {
201 _checkLevel($b, "backwards");
202 $level{$b} = 1;
203 }
204 for my $v (sort keys %level) {
205 $self->{backwardsFlag} += 1 << $v;
206 }
207 }
0116f5dc
JH
208
209 $self->{rearrange} = []
210 if ! defined $self->{rearrange};
211 croak "$PACKAGE: A list for rearrangement must be store in an ARRAYREF"
212 if ! ref $self->{rearrange};
213
214 # keys of $self->{rearrangeHash} are $self->{rearrange}.
215 $self->{rearrangeHash} = undef;
216
217 if (@{ $self->{rearrange} }) {
218 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
219 }
220
221 $self->{normCode} = undef;
a7fbee98
JH
222
223 if (defined $self->{normalization}) {
224 eval { require Unicode::Normalize };
225 croak "Unicode/Normalize.pm is required to normalize strings: $@"
226 if $@;
227
06c8fc8f
RGS
228 $CVgetCombinClass = \&Unicode::Normalize::getCombinClass
229 if ! $CVgetCombinClass;
a7fbee98 230
06c8fc8f
RGS
231 if ($self->{normalization} ne 'prenormalized') {
232 my $norm = $self->{normalization};
233 $self->{normCode} = sub {
1d2654e1
JH
234 Unicode::Normalize::normalize($norm, shift);
235 };
06c8fc8f
RGS
236 eval { $self->{normCode}->("") }; # try
237 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 238 }
a7fbee98 239 }
0116f5dc
JH
240 return;
241}
242
243sub new
244{
245 my $class = shift;
246 my $self = bless { @_ }, $class;
45394607 247
a7fbee98 248 # If undef is passed explicitly, no file is read.
0116f5dc
JH
249 $self->{table} = $KeyFile if ! exists $self->{table};
250 $self->read_table if defined $self->{table};
905aa9f0 251
a7fbee98
JH
252 if ($self->{entry}) {
253 $self->parseEntry($_) foreach split /\n/, $self->{entry};
254 }
905aa9f0 255
9f1f04a1 256 $self->{level} ||= MaxLevel;
0116f5dc 257 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 258
0116f5dc
JH
259 $self->{overrideHangul} = ''
260 if ! exists $self->{overrideHangul};
261 $self->{overrideCJK} = ''
262 if ! exists $self->{overrideCJK};
06c8fc8f 263 $self->{normalization} = 'NFD'
0116f5dc
JH
264 if ! exists $self->{normalization};
265 $self->{alternate} = $self->{alternateTable} || 'shifted'
266 if ! exists $self->{alternate};
267 $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
a7fbee98 268 if ! exists $self->{rearrange};
0116f5dc
JH
269 $self->{backwards} = $self->{backwardsTable}
270 if ! exists $self->{backwards};
a7fbee98 271
0116f5dc 272 $self->checkCollator;
a7fbee98
JH
273
274 return $self;
275}
905aa9f0
TS
276
277sub read_table {
a7fbee98
JH
278 my $self = shift;
279 my $file = $self->{table} ne '' ? $self->{table} : $KeyFile;
280
e69a2255
JH
281 my $filepath = File::Spec->catfile($Path, $file);
282 open my $fk, "<$filepath"
283 or croak "File does not exist at $filepath";
a7fbee98
JH
284
285 while (<$fk>) {
286 next if /^\s*#/;
287 if (/^\s*\@/) {
0116f5dc
JH
288 if (/^\s*\@version\s*(\S*)/) {
289 $self->{versionTable} ||= $1;
290 }
291 elsif (/^\s*\@alternate\s+(\S*)/) {
292 $self->{alternateTable} ||= $1;
a7fbee98 293 }
0116f5dc
JH
294 elsif (/^\s*\@backwards\s+(\S*)/) {
295 push @{ $self->{backwardsTable} }, $1;
a7fbee98 296 }
0116f5dc
JH
297 elsif (/^\s*\@forwards\s+(\S*)/) { # parhaps no use
298 push @{ $self->{forwardsTable} }, $1;
a7fbee98 299 }
0116f5dc
JH
300 elsif (/^\s*\@rearrange\s+(.*)/) { # (\S*) is NG
301 push @{ $self->{rearrangeTable} }, _getHexArray($1);
a7fbee98
JH
302 }
303 next;
304 }
305 $self->parseEntry($_);
45394607 306 }
a7fbee98 307 close $fk;
45394607
JH
308}
309
905aa9f0 310
45394607
JH
311##
312## get $line, parse it, and write an entry in $self
313##
314sub parseEntry
315{
a7fbee98
JH
316 my $self = shift;
317 my $line = shift;
4d36a948 318 my($name, $entry, @uv, @key);
a7fbee98
JH
319
320 return if $line !~ /^\s*[0-9A-Fa-f]/;
321
322 # removes comment and gets name
323 $name = $1
324 if $line =~ s/[#%]\s*(.*)//;
325 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
326
327 # gets element
328 my($e, $k) = split /;/, $line;
329 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
330 if ! $k;
331
4d36a948
TS
332 @uv = _getHexArray($e);
333 return if !@uv;
334
335 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 336
4d36a948 337 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 338 my $ele = pack_U(@uv);
a7fbee98 339
4d36a948
TS
340 # regarded as if it were not entried in the table
341 return
342 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 343
4d36a948
TS
344 # replaced as completely ignorable
345 $k = '[.0000.0000.0000.0000]'
346 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 347 }
0116f5dc 348
4d36a948
TS
349 # replaced as completely ignorable
350 $k = '[.0000.0000.0000.0000]'
351 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
352
4c843366 353 my $is_L3_ignorable = TRUE;
4d36a948 354
caffd4cf
TS
355 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
356 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
TS
357 my @wt = _getHexArray($arr);
358 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366
JH
359 $is_L3_ignorable = FALSE
360 if $wt[0] + $wt[1] + $wt[2] != 0;
4d36a948
TS
361 # if $arr !~ /[1-9A-Fa-f]/; NG
362 # Conformance Test shows L3-ignorable is completely ignorable.
4c843366
JH
363 # For expansion, an entry $is_L3_ignorable
364 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 365 }
caffd4cf 366
4d36a948 367 $self->{entries}{$entry} = \@key;
caffd4cf 368
4d36a948
TS
369 $self->{L3_ignorable}{$uv[0]} = TRUE
370 if @uv == 1 && $is_L3_ignorable;
caffd4cf 371
4d36a948
TS
372 # Contraction is to be considered in the range of this maxlength.
373 $self->{maxlength}{$uv[0]} = scalar @uv
374 if @uv > 1;
45394607
JH
375}
376
9f1f04a1 377
45394607 378##
9f1f04a1 379## arrayref[weights] = altCE(VCE)
45394607 380##
d16e9e3d 381sub altCE
45394607 382{
a7fbee98 383 my $self = shift;
4d36a948 384 my($var, @wt) = unpack(VCE_TEMPLATE, shift);
a7fbee98
JH
385
386 $self->{alternate} eq 'blanked' ?
9f1f04a1 387 $var ? [Var1Wt, 0, 0, $wt[3]] : \@wt :
a7fbee98 388 $self->{alternate} eq 'non-ignorable' ?
4d36a948 389 \@wt :
a7fbee98 390 $self->{alternate} eq 'shifted' ?
9f1f04a1
RGS
391 $var ? [Var1Wt, 0, 0, $wt[0] ]
392 : [ @wt[0..2], $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0 ] :
a7fbee98 393 $self->{alternate} eq 'shift-trimmed' ?
9f1f04a1 394 $var ? [Var1Wt, 0, 0, $wt[0] ] : [ @wt[0..2], 0 ] :
a7fbee98 395 croak "$PACKAGE unknown alternate name: $self->{alternate}";
45394607
JH
396}
397
45394607
JH
398sub viewSortKey
399{
a7fbee98 400 my $self = shift;
9f1f04a1
RGS
401 $self->visualizeSortKey($self->getSortKey(@_));
402}
0116f5dc 403
9f1f04a1
RGS
404sub visualizeSortKey
405{
406 my $self = shift;
407 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 408
9f1f04a1 409 if ($self->{UCA_Version} <= 8) {
0116f5dc
JH
410 $view =~ s/ ?0000 ?/|/g;
411 } else {
412 $view =~ s/\b0000\b/|/g;
413 }
a7fbee98 414 return "[$view]";
45394607
JH
415}
416
d16e9e3d 417
45394607 418##
4d36a948
TS
419## arrayref of JCPS = splitCE(string to be collated)
420## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitCE(string, true)
45394607 421##
d16e9e3d 422sub splitCE
45394607 423{
a7fbee98 424 my $self = shift;
4d36a948
TS
425 my $wLen = $_[1];
426
a7fbee98 427 my $code = $self->{preprocess};
0116f5dc 428 my $norm = $self->{normCode};
a7fbee98
JH
429 my $ent = $self->{entries};
430 my $max = $self->{maxlength};
431 my $reH = $self->{rearrangeHash};
4d36a948 432 my $ign = $self->{L3_ignorable};
0116f5dc 433 my $ver9 = $self->{UCA_Version} > 8;
a7fbee98 434
4d36a948 435 my ($str, @buf);
a7fbee98 436
4d36a948
TS
437 if ($wLen) {
438 $code and croak "Preprocess breaks character positions. "
439 . "Don't use with index(), match(), etc.";
440 $norm and croak "Normalization breaks character positions. "
441 . "Don't use with index(), match(), etc.";
442 $str = $_[0];
443 }
444 else {
445 $str = $_[0];
446 $str = &$code($str) if ref $code;
447 $str = &$norm($str) if ref $norm;
448 }
a7fbee98 449
4d36a948 450 # get array of Unicode code point of string.
9f1f04a1 451 my @src = unpack_U($str);
4d36a948
TS
452
453 # rearrangement:
454 # Character positions are not kept if rearranged,
455 # then neglected if $wLen is true.
456 if ($reH && ! $wLen) {
a7fbee98
JH
457 for (my $i = 0; $i < @src; $i++) {
458 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
459 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
460 $i++;
461 }
462 }
45394607 463 }
45394607 464
0116f5dc 465 if ($ver9) {
4d36a948
TS
466 # To remove a character marked as a completely ignorable.
467 for (my $i = 0; $i < @src; $i++) {
468 $src[$i] = undef if $ign->{ $src[$i] };
469 }
0116f5dc
JH
470 }
471
a7fbee98 472 for (my $i = 0; $i < @src; $i++) {
4d36a948
TS
473 next if _isNonCharacter($src[$i]);
474
475 my $i_orig = $i;
476 my $ce = $src[$i];
477
478 if ($max->{$ce}) { # contract
479 my $temp_ce = $ce;
1d2654e1
JH
480 my $ceLen = 1;
481 my $maxLen = $max->{$ce};
4d36a948 482
1d2654e1 483 for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) {
4d36a948
TS
484 next if ! defined $src[$p];
485 $temp_ce .= CODE_SEP . $src[$p];
1d2654e1 486 $ceLen++;
4d36a948
TS
487 if ($ent->{$temp_ce}) {
488 $ce = $temp_ce;
489 $i = $p;
490 }
491 }
4d36a948 492
06c8fc8f
RGS
493 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
494 # This process requires Unicode::Normalize.
495 # If "normalize" is undef, here should be skipped *always*
496 # (in spite of bool value of $CVgetCombinClass),
497 # since canonical ordering cannot be expected.
498 # Blocked combining character should not be contracted.
499
500 if ($self->{normalization})
501 # $self->{normCode} is false in the case of "prenormalized".
502 {
503 my $preCC = 0;
504 my $curCC = 0;
505
506 for (my $p = $i + 1; $p < @src; $p++) {
507 next if ! defined $src[$p];
508 $curCC = $CVgetCombinClass->($src[$p]);
509 last unless $curCC;
510 my $tail = CODE_SEP . $src[$p];
511 if ($preCC != $curCC && $ent->{$ce.$tail}) {
512 $ce .= $tail;
513 $src[$p] = undef;
514 } else {
515 $preCC = $curCC;
516 }
4d36a948 517 }
a7fbee98 518 }
a7fbee98
JH
519 }
520
4d36a948
TS
521 if ($wLen) {
522 for (my $p = $i + 1; $p < @src; $p++) {
523 last if defined $src[$p];
524 $i = $p;
a7fbee98
JH
525 }
526 }
4d36a948
TS
527
528 push @buf, $wLen ? [$ce, $i_orig, $i + 1] : $ce;
45394607 529 }
4d36a948 530 return \@buf;
d16e9e3d 531}
45394607 532
d16e9e3d
JH
533
534##
4d36a948 535## list of arrayrefs of weights = getWt(JCPS)
d16e9e3d
JH
536##
537sub getWt
538{
a7fbee98 539 my $self = shift;
4d36a948 540 my $ce = shift;
a7fbee98 541 my $ent = $self->{entries};
0116f5dc 542 my $der = $self->{derivCode};
a7fbee98 543
4d36a948
TS
544 return if !defined $ce;
545 return map($self->altCE($_), @{ $ent->{$ce} })
546 if $ent->{$ce};
0116f5dc 547
4d36a948
TS
548 # CE must not be a contraction, then it's a code point.
549 my $u = $ce;
a7fbee98 550
4d36a948 551 if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale
1d2654e1
JH
552 my $hang = $self->{overrideHangul};
553 my @hangulCE;
554 if ($hang) {
555 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
556 }
557 elsif (!defined $hang) {
558 @hangulCE = $der->($u);
559 }
560 else {
561 my $max = $self->{maxlength};
562 my @decH = _decompHangul($u);
563
564 if (@decH == 2) {
565 my $contract = join(CODE_SEP, @decH);
566 @decH = ($contract) if $ent->{$contract};
567 } else { # must be <@decH == 3>
568 if ($max->{$decH[0]}) {
569 my $contract = join(CODE_SEP, @decH);
570 if ($ent->{$contract}) {
571 @decH = ($contract);
572 } else {
573 $contract = join(CODE_SEP, @decH[0,1]);
574 $ent->{$contract} and @decH = ($contract, $decH[2]);
575 }
576 # even if V's ignorable, LT contraction is not supported.
577 # If such a situatution were required, NFD should be used.
578 }
579 if (@decH == 3 && $max->{$decH[1]}) {
580 my $contract = join(CODE_SEP, @decH[1,2]);
581 $ent->{$contract} and @decH = ($decH[0], $contract);
582 }
583 }
584
585 @hangulCE = map({
586 $ent->{$_} ? @{ $ent->{$_} } : $der->($_);
587 } @decH);
588 }
589 return map $self->altCE($_), @hangulCE;
a7fbee98
JH
590 }
591 elsif (0x3400 <= $u && $u <= 0x4DB5 ||
592 0x4E00 <= $u && $u <= 0x9FA5 ||
1d2654e1
JH
593 0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph
594 {
595 my $cjk = $self->{overrideCJK};
0116f5dc
JH
596 return map $self->altCE($_),
597 $cjk
4d36a948 598 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
0116f5dc 599 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
9f1f04a1 600 ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
0116f5dc 601 : $der->($u);
a7fbee98
JH
602 }
603 else {
0116f5dc 604 return map $self->altCE($_), $der->($u);
a7fbee98 605 }
d16e9e3d
JH
606}
607
d16e9e3d
JH
608
609##
610## string sortkey = getSortKey(string arg)
611##
612sub getSortKey
613{
a7fbee98
JH
614 my $self = shift;
615 my $lev = $self->{level};
4d36a948 616 my $rCE = $self->splitCE(shift); # get an arrayref of JCPS
0116f5dc 617 my $ver9 = $self->{UCA_Version} > 8;
4d36a948 618 my $v2i = $self->{alternate} ne 'non-ignorable';
a7fbee98
JH
619
620 # weight arrays
0116f5dc
JH
621 my (@buf, $last_is_variable);
622
4d36a948
TS
623 foreach my $wt (map $self->getWt($_), @$rCE) {
624 if ($v2i && $ver9) {
625 if ($wt->[0] == 0) { # ignorable
626 next if $last_is_variable;
0116f5dc 627 } else {
9f1f04a1 628 $last_is_variable = ($wt->[0] == Var1Wt);
0116f5dc
JH
629 }
630 }
4d36a948 631 push @buf, $wt;
0116f5dc 632 }
a7fbee98
JH
633
634 # make sort key
635 my @ret = ([],[],[],[]);
636 foreach my $v (0..$lev-1) {
637 foreach my $b (@buf) {
4d36a948
TS
638 push @{ $ret[$v] }, $b->[$v]
639 if 0 < $b->[$v];
a7fbee98
JH
640 }
641 }
45394607 642
a7fbee98
JH
643 # modification of tertiary weights
644 if ($self->{upper_before_lower}) {
645 foreach (@{ $ret[2] }) {
646 if (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
647 elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
648 elsif ($_ == 0x1C) { $_ += 1 } # square upper
649 elsif ($_ == 0x1D) { $_ -= 1 } # square lower
650 }
45394607 651 }
a7fbee98
JH
652 if ($self->{katakana_before_hiragana}) {
653 foreach (@{ $ret[2] }) {
654 if (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
655 elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
656 }
45394607 657 }
9f1f04a1
RGS
658
659 if ($self->{backwardsFlag}) {
660 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
661 if ($self->{backwardsFlag} & (1 << $v)) {
662 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
663 }
664 }
665 }
666
4d36a948 667 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607
JH
668}
669
670
671##
d16e9e3d 672## int compare = cmp(string a, string b)
45394607 673##
5398038e
TS
674sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
675sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
676sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
677sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
678sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
679sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
680sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
681
682##
d16e9e3d 683## list[strings] sorted = sort(list[strings] arg)
45394607 684##
a7fbee98
JH
685sub sort {
686 my $obj = shift;
687 return
688 map { $_->[1] }
689 sort{ $a->[0] cmp $b->[0] }
690 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
691}
692
0116f5dc 693
4d36a948 694sub _derivCE_9 {
0116f5dc
JH
695 my $u = shift;
696 my $base =
4d36a948
TS
697 (0x4E00 <= $u && $u <= 0x9FA5)
698 ? 0xFB40 : # CJK
0116f5dc 699 (0x3400 <= $u && $u <= 0x4DB5 || 0x20000 <= $u && $u <= 0x2A6D6)
4d36a948
TS
700 ? 0xFB80 # CJK ext.
701 : 0xFBC0; # others
0116f5dc
JH
702
703 my $aaaa = $base + ($u >> 15);
704 my $bbbb = ($u & 0x7FFF) | 0x8000;
705 return
9f1f04a1
RGS
706 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
707 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc
JH
708}
709
4d36a948 710sub _derivCE_8 {
0116f5dc
JH
711 my $code = shift;
712 my $aaaa = 0xFF80 + ($code >> 15);
713 my $bbbb = ($code & 0x7FFF) | 0x8000;
714 return
4d36a948
TS
715 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
716 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607
JH
717}
718
719##
720## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
721##
a7fbee98 722sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 723
a7fbee98 724#
4d36a948 725# $code *must* be in Hangul syllable.
a7fbee98
JH
726# Check it before you enter here.
727#
5398038e
TS
728sub _decompHangul {
729 my $code = shift;
5398038e
TS
730 my $SIndex = $code - 0xAC00;
731 my $LIndex = int( $SIndex / 588);
732 my $VIndex = int(($SIndex % 588) / 28);
733 my $TIndex = $SIndex % 28;
734 return (
a7fbee98
JH
735 0x1100 + $LIndex,
736 0x1161 + $VIndex,
737 $TIndex ? (0x11A7 + $TIndex) : (),
5398038e 738 );
45394607
JH
739}
740
4d36a948
TS
741sub _isNonCharacter {
742 my $code = shift;
743 return ! defined $code # removed
744 || ($code < 0 || 0x10FFFF < $code) # out of range
745 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
746 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
747 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
748 ;
749}
750
751
752##
753## bool _nonIgnorAtLevel(arrayref weights, int level)
754##
755sub _nonIgnorAtLevel($$)
756{
757 my $wt = shift;
758 return if ! defined $wt;
759 my $lv = shift;
9f1f04a1 760 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
TS
761}
762
763##
764## bool _eqArray(
765## arrayref of arrayref[weights] source,
766## arrayref of arrayref[weights] substr,
767## int level)
768## * comparison of graphemes vs graphemes.
769## @$source >= @$substr must be true (check it before call this);
770##
771sub _eqArray($$$)
772{
773 my $source = shift;
774 my $substr = shift;
775 my $lev = shift;
776
777 for my $g (0..@$substr-1){
778 # Do the $g'th graphemes have the same number of AV weigths?
779 return if @{ $source->[$g] } != @{ $substr->[$g] };
780
781 for my $w (0..@{ $substr->[$g] }-1) {
782 for my $v (0..$lev-1) {
783 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
784 }
785 }
786 }
787 return 1;
788}
789
790##
791## (int position, int length)
792## int position = index(string, substring, position, [undoc'ed grobal])
793##
794## With "grobal" (only for the list context),
795## returns list of arrayref[position, length].
796##
797sub index
798{
799 my $self = shift;
800 my $str = shift;
801 my $len = length($str);
802 my $subCE = $self->splitCE(shift);
803 my $pos = @_ ? shift : 0;
804 $pos = 0 if $pos < 0;
805 my $grob = shift;
806
4d36a948
TS
807 my $lev = $self->{level};
808 my $ver9 = $self->{UCA_Version} > 8;
809 my $v2i = $self->{alternate} ne 'non-ignorable';
810
811 if (! @$subCE) {
812 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
813 return $grob
814 ? map([$_, 0], $temp..$len)
815 : wantarray ? ($temp,0) : $temp;
816 }
817 if ($len < $pos) {
818 return wantarray ? () : NOMATCHPOS;
819 }
820 my $strCE = $self->splitCE($pos ? substr($str, $pos) : $str, TRUE);
821 if (! @$strCE) {
822 return wantarray ? () : NOMATCHPOS;
823 }
824 my $last_is_variable;
825 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
826
827 $last_is_variable = FALSE;
828 for my $wt (map $self->getWt($_), @$subCE) {
829 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
830
831 if ($v2i && $ver9) {
832 if ($wt->[0] == 0) {
833 $to_be_pushed = FALSE if $last_is_variable;
834 } else {
9f1f04a1 835 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948
TS
836 }
837 }
838
839 if (@subWt && $wt->[0] == 0) {
840 push @{ $subWt[-1] }, $wt if $to_be_pushed;
841 } else {
9f1f04a1 842 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948
TS
843 push @subWt, [ $wt ];
844 }
845 }
846
847 my $count = 0;
848 my $end = @$strCE - 1;
849
850 $last_is_variable = FALSE;
851
852 for (my $i = 0; $i <= $end; ) { # no $i++
853 my $found_base = 0;
854
855 # fetch a grapheme
856 while ($i <= $end && $found_base == 0) {
857 for my $wt ($self->getWt($strCE->[$i][0])) {
858 my $to_be_pushed = _nonIgnorAtLevel($wt,$lev);
859
860 if ($v2i && $ver9) {
861 if ($wt->[0] == 0) {
862 $to_be_pushed = FALSE if $last_is_variable;
863 } else {
9f1f04a1 864 $last_is_variable = ($wt->[0] == Var1Wt);
4d36a948
TS
865 }
866 }
867
868 if (@strWt && $wt->[0] == 0) {
869 push @{ $strWt[-1] }, $wt if $to_be_pushed;
870 $finPos[-1] = $strCE->[$i][2];
871 } elsif ($to_be_pushed) {
9f1f04a1 872 $wt->[0] = 0 if $wt->[0] == Var1Wt;
4d36a948
TS
873 push @strWt, [ $wt ];
874 push @iniPos, $found_base ? NOMATCHPOS : $strCE->[$i][1];
875 $finPos[-1] = NOMATCHPOS if $found_base;
876 push @finPos, $strCE->[$i][2];
877 $found_base++;
878 }
879 # else ===> no-op
880 }
881 $i++;
882 }
883
884 # try to match
885 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
886 if ($iniPos[0] != NOMATCHPOS &&
887 $finPos[$#subWt] != NOMATCHPOS &&
888 _eqArray(\@strWt, \@subWt, $lev)) {
889 my $temp = $iniPos[0] + $pos;
890
891 if ($grob) {
892 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
893 splice @strWt, 0, $#subWt;
894 splice @iniPos, 0, $#subWt;
895 splice @finPos, 0, $#subWt;
896 }
897 else {
898 return wantarray
899 ? ($temp, $finPos[$#subWt] - $iniPos[0])
900 : $temp;
901 }
902 }
903 shift @strWt;
904 shift @iniPos;
905 shift @finPos;
906 }
907 }
908
909 return $grob
910 ? @g_ret
911 : wantarray ? () : NOMATCHPOS;
912}
913
914##
915## scalarref to matching part = match(string, substring)
916##
917sub match
918{
919 my $self = shift;
920 if (my($pos,$len) = $self->index($_[0], $_[1])) {
921 my $temp = substr($_[0], $pos, $len);
922 return wantarray ? $temp : \$temp;
923 # An lvalue ref \substr should be avoided,
924 # since its value is affected by modification of its referent.
925 }
926 else {
927 return;
928 }
929}
930
931##
932## arrayref matching parts = gmatch(string, substring)
933##
934sub gmatch
935{
936 my $self = shift;
937 my $str = shift;
938 my $sub = shift;
939 return map substr($str, $_->[0], $_->[1]),
940 $self->index($str, $sub, 0, 'g');
941}
942
943##
944## bool subst'ed = subst(string, substring, replace)
945##
946sub subst
947{
948 my $self = shift;
949 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
950
951 if (my($pos,$len) = $self->index($_[0], $_[1])) {
952 if ($code) {
953 my $mat = substr($_[0], $pos, $len);
954 substr($_[0], $pos, $len, $code->($mat));
955 } else {
956 substr($_[0], $pos, $len, $_[2]);
957 }
958 return TRUE;
959 }
960 else {
961 return FALSE;
962 }
963}
964
965##
966## int count = gsubst(string, substring, replace)
967##
968sub gsubst
969{
970 my $self = shift;
971 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
972 my $cnt = 0;
973
974 # Replacement is carried out from the end, then use reverse.
975 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
976 if ($code) {
977 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
978 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
979 } else {
980 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
981 }
982 $cnt++;
983 }
984 return $cnt;
985}
986
45394607
JH
9871;
988__END__
989
990=head1 NAME
991
a7fbee98 992Unicode::Collate - Unicode Collation Algorithm
45394607
JH
993
994=head1 SYNOPSIS
995
996 use Unicode::Collate;
997
998 #construct
5398038e 999 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1000
1001 #sort
5398038e 1002 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1003
1004 #compare
a7fbee98 1005 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607
JH
1006
1007=head1 DESCRIPTION
1008
4d36a948
TS
1009This module is an implementation
1010of Unicode Technical Standard #10 (UTS #10)
1011"Unicode Collation Algorithm."
1012
45394607
JH
1013=head2 Constructor and Tailoring
1014
d16e9e3d
JH
1015The C<new> method returns a collator object.
1016
5398038e 1017 $Collator = Unicode::Collate->new(
0116f5dc 1018 UCA_Version => $UCA_Version,
45394607
JH
1019 alternate => $alternate,
1020 backwards => $levelNumber, # or \@levelNumbers
1021 entry => $element,
1022 normalization => $normalization_form,
1023 ignoreName => qr/$ignoreName/,
1024 ignoreChar => qr/$ignoreChar/,
1025 katakana_before_hiragana => $bool,
1026 level => $collationLevel,
1027 overrideCJK => \&overrideCJK,
1028 overrideHangul => \&overrideHangul,
1029 preprocess => \&preprocess,
1030 rearrange => \@charList,
1031 table => $filename,
1032 undefName => qr/$undefName/,
1033 undefChar => qr/$undefChar/,
1034 upper_before_lower => $bool,
1035 );
a7fbee98 1036 # if %tailoring is false (i.e. empty),
5398038e 1037 # $Collator should do the default collation.
45394607
JH
1038
1039=over 4
1040
0116f5dc
JH
1041=item UCA_Version
1042
1043If the version number of the older UCA is given,
1044the older behavior of that version is emulated on collating.
1045If omitted, the return value of C<UCA_Version()> is used.
1046
1047The supported version: 8 or 9.
1048
1049B<This parameter may be removed in the future version,
1050as switching the algorithm would affect the performance.>
1051
45394607
JH
1052=item alternate
1053
4d36a948 1054-- see 3.2.2 Variable Weighting, UTS #10.
caffd4cf
TS
1055
1056(the title in UCA version 8: Alternate Weighting)
45394607 1057
a7fbee98
JH
1058This key allows to alternate weighting for variable collation elements,
1059which are marked with an ASTERISK in the table
1060(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1061
1062 alternate => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
45394607 1063
a7fbee98 1064These names are case-insensitive.
45394607
JH
1065By default (if specification is omitted), 'shifted' is adopted.
1066
4d36a948 1067 'Blanked' Variable elements are made ignorable at levels 1 through 3;
a7fbee98
JH
1068 considered at the 4th level.
1069
1070 'Non-ignorable' Variable elements are not reset to ignorable.
1071
4d36a948 1072 'Shifted' Variable elements are made ignorable at levels 1 through 3
a7fbee98
JH
1073 their level 4 weight is replaced by the old level 1 weight.
1074 Level 4 weight for Non-Variable elements is 0xFFFF.
1075
1076 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1077 are trimmed.
1078
45394607
JH
1079=item backwards
1080
4d36a948 1081-- see 3.1.2 French Accents, UTS #10.
45394607
JH
1082
1083 backwards => $levelNumber or \@levelNumbers
1084
1085Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1086If omitted, forwards at all the levels.
1087
1088=item entry
1089
4d36a948 1090-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1091
a7fbee98 1092Overrides a default order or defines additional collation elements
45394607
JH
1093
1094 entry => <<'ENTRIES', # use the UCA file format
a7fbee98 109500E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a><e>
45394607
JH
10960063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
10970043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
1098ENTRIES
1099
4d36a948
TS
1100B<NOTE:> The code point in the UCA file format (before C<';'>)
1101B<must> be a Unicode code point, but not a native code point.
1102So C<0063> must always denote C<U+0063>,
1103but not a character of C<"\x63">.
1104
45394607
JH
1105=item ignoreName
1106
1107=item ignoreChar
1108
4d36a948 1109-- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
45394607 1110
caffd4cf
TS
1111Makes the entry in the table completely ignorable;
1112i.e. as if the weights were zero at all level.
45394607 1113
a7fbee98 1114E.g. when 'a' and 'e' are ignorable,
45394607
JH
1115'element' is equal to 'lament' (or 'lmnt').
1116
1117=item level
1118
4d36a948 1119-- see 4.3 Form a sort key for each string, UTS #10.
45394607
JH
1120
1121Set the maximum level.
1122Any higher levels than the specified one are ignored.
1123
1124 Level 1: alphabetic ordering
1125 Level 2: diacritic ordering
1126 Level 3: case ordering
1127 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
1128
1129 ex.level => 2,
1130
a7fbee98
JH
1131If omitted, the maximum is the 4th.
1132
45394607
JH
1133=item normalization
1134
4d36a948 1135-- see 4.1 Normalize each input string, UTS #10.
45394607 1136
905aa9f0 1137If specified, strings are normalized before preparation of sort keys
45394607
JH
1138(the normalization is executed after preprocess).
1139
1d2654e1
JH
1140A form name C<Unicode::Normalize::normalize()> accepts will be applied
1141as C<$normalization_form>.
06c8fc8f 1142Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1143See C<Unicode::Normalize::normalize()> for detail.
1144If omitted, C<'NFD'> is used.
45394607 1145
1d2654e1 1146L<normalization> is performed after L<preprocess> (if defined).
45394607 1147
06c8fc8f
RGS
1148Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1149though they are not concerned with C<Unicode::Normalize::normalize()>.
1150
1151If C<undef> (not a string C<"undef">) is passed explicitly
1152as the value for this key,
45394607
JH
1153any normalization is not carried out (this may make tailoring easier
1154if any normalization is not desired).
06c8fc8f
RGS
1155Under C<(normalization =E<gt> undef)>, only contiguous contractions
1156are resolved; e.g. C<A-cedilla-ring> would be primary equal to C<A>,
1157even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>.
1158In this point,
1159C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1160B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1161
1162In the case of C<(normalization =E<gt> "prenormalized")>,
1163any normalization is not performed, but
1164non-contiguous contractions with combining characters are performed.
1165Therefore
1166C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1167B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1168If source strings are finely prenormalized,
1169C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1170
1171Except C<(normalization =E<gt> undef)>,
1172B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1173
1174=item overrideCJK
1175
4d36a948 1176-- see 7.1 Derived Collation Elements, UTS #10.
45394607
JH
1177
1178By default, mapping of CJK Unified Ideographs
a7fbee98
JH
1179uses the Unicode codepoint order.
1180But the mapping of CJK Unified Ideographs may be overrided.
45394607 1181
a7fbee98 1182ex. CJK Unified Ideographs in the JIS code point order.
45394607
JH
1183
1184 overrideCJK => sub {
a7fbee98
JH
1185 my $u = shift; # get a Unicode codepoint
1186 my $b = pack('n', $u); # to UTF-16BE
1187 my $s = your_unicode_to_sjis_converter($b); # convert
1188 my $n = unpack('n', $s); # convert sjis to short
1189 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1190 },
1191
a7fbee98
JH
1192ex. ignores all CJK Unified Ideographs.
1193
1194 overrideCJK => sub {()}, # CODEREF returning empty list
1195
1196 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1197 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1198
1199If C<undef> is passed explicitly as the value for this key,
1200weights for CJK Unified Ideographs are treated as undefined.
1201But assignment of weight for CJK Unified Ideographs
1202in table or L<entry> is still valid.
1203
1204=item overrideHangul
1205
4d36a948 1206-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98
JH
1207
1208By default, Hangul Syllables are decomposed into Hangul Jamo.
1209But the mapping of Hangul Syllables may be overrided.
1210
1211This tag works like L<overrideCJK>, so see there for examples.
1212
45394607
JH
1213If you want to override the mapping of Hangul Syllables,
1214the Normalization Forms D and KD are not appropriate
1215(they will be decomposed before overriding).
1216
a7fbee98
JH
1217If C<undef> is passed explicitly as the value for this key,
1218weight for Hangul Syllables is treated as undefined
1219without decomposition into Hangul Jamo.
1220But definition of weight for Hangul Syllables
1221in table or L<entry> is still valid.
1222
45394607
JH
1223=item preprocess
1224
4d36a948 1225-- see 5.1 Preprocessing, UTS #10.
45394607
JH
1226
1227If specified, the coderef is used to preprocess
1228before the formation of sort keys.
1229
a7fbee98 1230ex. dropping English articles, such as "a" or "the".
45394607
JH
1231Then, "the pen" is before "a pencil".
1232
1233 preprocess => sub {
1234 my $str = shift;
a7fbee98 1235 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1236 return $str;
45394607
JH
1237 },
1238
1d2654e1
JH
1239L<preprocess> is performed before L<normalization> (if defined).
1240
45394607
JH
1241=item rearrange
1242
4d36a948 1243-- see 3.1.3 Rearrangement, UTS #10.
45394607
JH
1244
1245Characters that are not coded in logical order and to be rearranged.
a7fbee98 1246By default,
45394607
JH
1247
1248 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1249
a7fbee98
JH
1250If you want to disallow any rearrangement,
1251pass C<undef> or C<[]> (a reference to an empty list)
1252as the value for this key.
1253
0116f5dc
JH
1254B<According to the version 9 of UCA, this parameter shall not be used;
1255but it is not warned at present.>
1256
45394607
JH
1257=item table
1258
4d36a948 1259-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607
JH
1260
1261You can use another element table if desired.
ae6aa562
JH
1262The table file must be put into a directory
1263where F<Unicode/Collate.pm> is installed.
1264E.g. in F<perl/lib/Unicode/Collate> directory
1265when you have F<perl/lib/Unicode/Collate.pm>.
45394607 1266
ae6aa562 1267By default, the filename F<"allkeys.txt"> is used.
45394607 1268
a7fbee98
JH
1269If C<undef> is passed explicitly as the value for this key,
1270no file is read (but you can define collation elements via L<entry>).
1271
1272A typical way to define a collation element table
1273without any file of table:
1274
1275 $onlyABC = Unicode::Collate->new(
1276 table => undef,
1277 entry => << 'ENTRIES',
12780061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
12790041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
12800062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
12810042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
12820063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
12830043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1284ENTRIES
1285 );
905aa9f0 1286
45394607
JH
1287=item undefName
1288
1289=item undefChar
1290
4d36a948 1291-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607
JH
1292
1293Undefines the collation element as if it were unassigned in the table.
1294This reduces the size of the table.
1295If an unassigned character appears in the string to be collated,
1296the sort key is made from its codepoint
1297as a single-character collation element,
1298as it is greater than any other assigned collation elements
1299(in the codepoint order among the unassigned characters).
1300But, it'd be better to ignore characters
1301unfamiliar to you and maybe never used.
1302
1303=item katakana_before_hiragana
1304
1305=item upper_before_lower
1306
4d36a948 1307-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
45394607
JH
1308
1309By default, lowercase is before uppercase
1310and hiragana is before katakana.
1311
a7fbee98
JH
1312If the tag is made true, this is reversed.
1313
1314B<NOTE>: These tags simplemindedly assume
1315any lowercase/uppercase or hiragana/katakana distinctions
9f1f04a1
RGS
1316must occur in level 3, and their weights at level 3
1317must be same as those mentioned in 7.3.1, UTS #10.
1318If you define your collation elements which violate this requirement,
4d36a948 1319these tags don't work validly.
45394607
JH
1320
1321=back
1322
3164dd77 1323=head2 Methods for Collation
45394607
JH
1324
1325=over 4
1326
5398038e 1327=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1328
1329Sorts a list of strings.
1330
5398038e 1331=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1332
1333Returns 1 (when C<$a> is greater than C<$b>)
1334or 0 (when C<$a> is equal to C<$b>)
1335or -1 (when C<$a> is lesser than C<$b>).
1336
5398038e
TS
1337=item C<$result = $Collator-E<gt>eq($a, $b)>
1338
1339=item C<$result = $Collator-E<gt>ne($a, $b)>
1340
1341=item C<$result = $Collator-E<gt>lt($a, $b)>
1342
1343=item C<$result = $Collator-E<gt>le($a, $b)>
1344
1345=item C<$result = $Collator-E<gt>gt($a, $b)>
1346
1347=item C<$result = $Collator-E<gt>ge($a, $b)>
1348
a7fbee98 1349They works like the same name operators as theirs.
5398038e
TS
1350
1351 eq : whether $a is equal to $b.
1352 ne : whether $a is not equal to $b.
1353 lt : whether $a is lesser than $b.
1354 le : whether $a is lesser than $b or equal to $b.
1355 gt : whether $a is greater than $b.
1356 ge : whether $a is greater than $b or equal to $b.
1357
1358=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1359
4d36a948 1360-- see 4.3 Form a sort key for each string, UTS #10.
45394607
JH
1361
1362Returns a sort key.
1363
1364You compare the sort keys using a binary comparison
1365and get the result of the comparison of the strings using UCA.
1366
5398038e 1367 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1368
1369 is equivalent to
1370
5398038e 1371 $Collator->cmp($a, $b)
45394607 1372
a7fbee98
JH
1373=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1374
a7fbee98
JH
1375 use Unicode::Collate;
1376 my $c = Unicode::Collate->new();
1377 print $c->viewSortKey("Perl"),"\n";
1378
0116f5dc
JH
1379 # output:
1380 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1381 # Level 1 Level 2 Level 3 Level 4
1382
1383 (If C<UCA_Version> is 8, the output is slightly different.)
a7fbee98 1384
4d36a948
TS
1385=back
1386
1387=head2 Methods for Searching
d16e9e3d 1388
4d36a948
TS
1389B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1390for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1391C<subst>, C<gsubst>) is croaked,
1392as the position and the length might differ
1393from those on the specified string.
1394(And the C<rearrange> tag is neglected.)
d16e9e3d 1395
4d36a948
TS
1396The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1397like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1398but they are not aware of any pattern, but only a literal substring.
1399
1400=over 4
1401
1402=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1403
1404=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1405
1406If C<$substring> matches a part of C<$string>, returns
1407the position of the first occurrence of the matching part in scalar context;
1408in list context, returns a two-element list of
1409the position and the length of the matching part.
1410
d16e9e3d
JH
1411If C<$substring> does not match any part of C<$string>,
1412returns C<-1> in scalar context and
1413an empty list in list context.
1414
1415e.g. you say
1416
5398038e 1417 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948
TS
1418 # (normalization => undef) is REQUIRED.
1419 my $str = "Ich muß studieren Perl.";
1420 my $sub = "MÜSS";
d16e9e3d 1421 my $match;
a7fbee98 1422 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1423 $match = substr($str, $pos, $len);
d16e9e3d
JH
1424 }
1425
4d36a948
TS
1426and get C<"muß"> in C<$match> since C<"muß">
1427is primary equal to C<"MÜSS">.
1428
1429=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1430
1431=item C<($match) = $Collator-E<gt>match($string, $substring)>
1432
1433If C<$substring> matches a part of C<$string>, in scalar context, returns
1434B<a reference to> the first occurrence of the matching part
1435(C<$match_ref> is always true if matches,
1436since every reference is B<true>);
1437in list context, returns the first occurrence of the matching part.
1438
1439If C<$substring> does not match any part of C<$string>,
1440returns C<undef> in scalar context and
1441an empty list in list context.
1442
1443e.g.
1444
1445 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1446 print "matches [$$match_ref].\n";
1447 } else {
1448 print "doesn't match.\n";
1449 }
1450
1451 or
1452
1453 if (($match) = $Collator->match($str, $sub)) { # list context
1454 print "matches [$match].\n";
1455 } else {
1456 print "doesn't match.\n";
1457 }
1458
1459=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1460
1461If C<$substring> matches a part of C<$string>, returns
1462all the matching parts (or matching count in scalar context).
1463
1464If C<$substring> does not match any part of C<$string>,
1465returns an empty list.
1466
1467=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1468
1469If C<$substring> matches a part of C<$string>,
1470the first occurrence of the matching part is replaced by C<$replacement>
1471(C<$string> is modified) and return C<$count> (always equals to C<1>).
1472
1473C<$replacement> can be a C<CODEREF>,
1474taking the matching part as an argument,
1475and returning a string to replace the matching part
1476(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1477
1478=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1479
1480If C<$substring> matches a part of C<$string>,
1481all the occurrences of the matching part is replaced by C<$replacement>
1482(C<$string> is modified) and return C<$count>.
1483
1484C<$replacement> can be a C<CODEREF>,
1485taking the matching part as an argument,
1486and returning a string to replace the matching part
1487(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1488
1489e.g.
1490
1491 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1492 # (normalization => undef) is REQUIRED.
1493 my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1494 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1495
1496 # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1497 # i.e., all the camels are made bold-faced.
d16e9e3d 1498
45394607
JH
1499=back
1500
3164dd77
TS
1501=head2 Other Methods
1502
1503=over 4
1504
0116f5dc
JH
1505=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1506
1507Change the value of specified keys and returns the changed part.
1508
1509 $Collator = Unicode::Collate->new(level => 4);
1510
1511 $Collator->eq("perl", "PERL"); # false
1512
1513 %old = $Collator->change(level => 2); # returns (level => 4).
1514
1515 $Collator->eq("perl", "PERL"); # true
1516
1517 $Collator->change(%old); # returns (level => 2).
1518
1519 $Collator->eq("perl", "PERL"); # false
1520
1521Not all C<(key,value)>s are allowed to be changed.
1522See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1523
1524In the scalar context, returns the modified collator
1525(but it is B<not> a clone from the original).
1526
1527 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1528
1529 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1530
1531 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1532
3164dd77
TS
1533=item UCA_Version
1534
4d36a948 1535Returns the version number of UTS #10 this module consults.
3164dd77
TS
1536
1537=item Base_Unicode_Version
1538
1539Returns the version number of the Unicode Standard
1540this module is based on.
1541
1542=back
1543
45394607
JH
1544=head2 EXPORT
1545
1546None by default.
1547
1548=head2 CAVEAT
1549
1550Use of the C<normalization> parameter requires
1551the B<Unicode::Normalize> module.
1552
5398038e 1553If you need not it (say, in the case when you need not
45394607
JH
1554handle any combining characters),
1555assign C<normalization =E<gt> undef> explicitly.
1556
4d36a948 1557-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1558
0116f5dc
JH
1559=head2 Conformance Test
1560
1561The Conformance Test for the UCA is provided
1562in L<http://www.unicode.org/reports/tr10/CollationTest.html>
1563and L<http://www.unicode.org/reports/tr10/CollationTest.zip>
1564
1565For F<CollationTest_SHIFTED.txt>,
1566a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1567for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1568C<Unicode::Collate-E<gt>new(alternate =E<gt> "non-ignorable", level =E<gt> 3)>.
1569
4d36a948 1570B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1571
45394607
JH
1572=head1 AUTHOR
1573
1d2654e1 1574SADAHIRO Tomoyuki, <SADAHIRO@cpan.org>
45394607
JH
1575
1576 http://homepage1.nifty.com/nomenclator/perl/
1577
ae6aa562 1578 Copyright(C) 2001-2003, SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1579
a7fbee98
JH
1580 This library is free software; you can redistribute it
1581 and/or modify it under the same terms as Perl itself.
45394607
JH
1582
1583=head1 SEE ALSO
1584
1585=over 4
1586
0116f5dc 1587=item http://www.unicode.org/reports/tr10/
45394607 1588
4d36a948 1589Unicode Collation Algorithm - UTS #10
45394607 1590
0116f5dc 1591=item http://www.unicode.org/reports/tr10/allkeys.txt
a7fbee98
JH
1592
1593The Default Unicode Collation Element Table
45394607 1594
0116f5dc
JH
1595=item http://www.unicode.org/reports/tr10/CollationTest.html
1596http://www.unicode.org/reports/tr10/CollationTest.zip
1597
1598The latest versions of the conformance test for the UCA
1599
1600=item http://www.unicode.org/reports/tr15/
a7fbee98
JH
1601
1602Unicode Normalization Forms - UAX #15
1603
a7fbee98 1604=item L<Unicode::Normalize>
45394607 1605
45394607
JH
1606=back
1607
1608=cut