This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update Unicode-Collate to CPAN version 0.71
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c
JH
6 }
7}
8
45394607
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48
NC
15no warnings 'utf8';
16
cac3df65 17our $VERSION = '0.71';
45394607
JH
18our $PACKAGE = __PACKAGE__;
19
211cc501
CBW
20require DynaLoader;
21our @ISA = qw(DynaLoader);
22bootstrap Unicode::Collate $VERSION;
23
e7f779c8
RGS
24my @Path = qw(Unicode Collate);
25my $KeyFile = "allkeys.txt";
45394607 26
4d36a948
ST
27# Perl's boolean
28use constant TRUE => 1;
29use constant FALSE => "";
30use constant NOMATCHPOS => -1;
31
32# A coderef to get combining class imported from Unicode::Normalize
33# (i.e. \&Unicode::Normalize::getCombinClass).
34# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 35my $CVgetCombinClass;
4d36a948 36
9f1f04a1
RGS
37# Supported Levels
38use constant MinLevel => 1;
39use constant MaxLevel => 4;
40
4d36a948 41# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
42use constant Min2Wt => 0x20;
43use constant Min3Wt => 0x02;
4d36a948
ST
44
45# Shifted weight at 4th level
9f1f04a1 46use constant Shift4Wt => 0xFFFF;
4d36a948 47
4d36a948
ST
48# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
49# PROBLEM: The Default Unicode Collation Element Table
50# has weights over 0xFFFF at the 4th level.
51# The tie-breaking in the variable weights
52# other than "shift" (as well as "shift-trimmed") is unreliable.
53use constant VCE_TEMPLATE => 'Cn4';
54
4d36a948
ST
55# A sort key: 16-bit weights
56# See also the PROBLEM on VCE_TEMPLATE above.
57use constant KEY_TEMPLATE => 'n*';
58
59# Level separator in a sort key:
60# i.e. pack(KEY_TEMPLATE, 0)
61use constant LEVEL_SEP => "\0\0";
62
63# As Unicode code point separator for hash keys.
64# A joined code point string (denoted by JCPS below)
65# like "65;768" is used for internal processing
66# instead of Perl's Unicode string like "\x41\x{300}",
67# as the native code point is different from the Unicode code point
68# on EBCDIC platform.
69# This character must not be included in any stringified
70# representation of an integer.
71use constant CODE_SEP => ';';
72
73# boolean values of variable weights
0116f5dc
JH
74use constant NON_VAR => 0; # Non-Variable character
75use constant VAR => 1; # Variable character
3164dd77 76
91ae00cb 77# specific code points
00e00351
CBW
78use constant Hangul_SIni => 0xAC00;
79use constant Hangul_SFin => 0xD7A3;
584e761d 80
4d36a948 81# Logical_Order_Exception in PropList.txt
e7f779c8 82my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 83
00e00351 84sub UCA_Version { "20" }
a7fbee98 85
00e00351 86sub Base_Unicode_Version { "5.2.0" }
a7fbee98 87
9f1f04a1
RGS
88######
89
9f1f04a1 90sub pack_U {
ae6aa562 91 return pack('U*', @_);
9f1f04a1
RGS
92}
93
9f1f04a1
RGS
94######
95
91ae00cb
NC
96my (%VariableOK);
97@VariableOK{ qw/
0116f5dc 98 blanked non-ignorable shifted shift-trimmed
91ae00cb 99 / } = (); # keys lowercased
0116f5dc
JH
100
101our @ChangeOK = qw/
102 alternate backwards level normalization rearrange
103 katakana_before_hiragana upper_before_lower
104 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 105 hangul_terminator variable
0116f5dc
JH
106 /;
107
108our @ChangeNG = qw/
b5d9a953 109 entry mapping table maxlength contraction
91ae00cb 110 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 111 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
00e00351 112 derivCode normCode rearrangeHash backwardsFlag
aa7758f7 113 suppress suppressHash
211cc501 114 __useXS
0116f5dc 115 /;
9f1f04a1
RGS
116# The hash key 'ignored' is deleted at v 0.21.
117# The hash key 'isShift' is deleted at v 0.23.
118# The hash key 'combining' is deleted at v 0.24.
91ae00cb 119# The hash key 'entries' is deleted at v 0.30.
e7f779c8 120# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb
NC
121
122sub version {
123 my $self = shift;
124 return $self->{versionTable} || 'unknown';
125}
0116f5dc
JH
126
127my (%ChangeOK, %ChangeNG);
128@ChangeOK{ @ChangeOK } = ();
129@ChangeNG{ @ChangeNG } = ();
130
131sub change {
132 my $self = shift;
133 my %hash = @_;
134 my %old;
91ae00cb
NC
135 if (exists $hash{variable} && exists $hash{alternate}) {
136 delete $hash{alternate};
137 }
138 elsif (!exists $hash{variable} && exists $hash{alternate}) {
139 $hash{variable} = $hash{alternate};
140 }
0116f5dc
JH
141 foreach my $k (keys %hash) {
142 if (exists $ChangeOK{$k}) {
143 $old{$k} = $self->{$k};
144 $self->{$k} = $hash{$k};
145 }
146 elsif (exists $ChangeNG{$k}) {
147 croak "change of $k via change() is not allowed!";
148 }
149 # else => ignored
150 }
3756e7ca 151 $self->checkCollator();
0116f5dc
JH
152 return wantarray ? %old : $self;
153}
a7fbee98 154
9f1f04a1
RGS
155sub _checkLevel {
156 my $level = shift;
abd1ec54
NC
157 my $key = shift; # 'level' or 'backwards'
158 MinLevel <= $level or croak sprintf
159 "Illegal level %d (in value for key '%s') lower than %d.",
160 $level, $key, MinLevel;
161 $level <= MaxLevel or croak sprintf
162 "Unsupported level %d (in value for key '%s') higher than %d.",
163 $level, $key, MaxLevel;
9f1f04a1
RGS
164}
165
91ae00cb
NC
166my %DerivCode = (
167 8 => \&_derivCE_8,
168 9 => \&_derivCE_9,
169 11 => \&_derivCE_9, # 11 == 9
3756e7ca 170 14 => \&_derivCE_14,
74b94a79
CBW
171 16 => \&_derivCE_14, # 16 == 14
172 18 => \&_derivCE_18,
00e00351 173 20 => \&_derivCE_20,
b5d9a953 174 22 => \&_derivCE_22,
91ae00cb
NC
175);
176
0116f5dc
JH
177sub checkCollator {
178 my $self = shift;
9f1f04a1 179 _checkLevel($self->{level}, "level");
a7fbee98 180
91ae00cb
NC
181 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
182 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 183
91ae00cb 184 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 185 $self->{alternateTable} || 'shifted';
91ae00cb
NC
186 $self->{variable} = $self->{alternate} = lc($self->{variable});
187 exists $VariableOK{ $self->{variable} }
3756e7ca 188 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 189
9f1f04a1
RGS
190 if (! defined $self->{backwards}) {
191 $self->{backwardsFlag} = 0;
192 }
193 elsif (! ref $self->{backwards}) {
194 _checkLevel($self->{backwards}, "backwards");
195 $self->{backwardsFlag} = 1 << $self->{backwards};
196 }
197 else {
198 my %level;
199 $self->{backwardsFlag} = 0;
200 for my $b (@{ $self->{backwards} }) {
201 _checkLevel($b, "backwards");
202 $level{$b} = 1;
203 }
204 for my $v (sort keys %level) {
205 $self->{backwardsFlag} += 1 << $v;
206 }
207 }
0116f5dc 208
91ae00cb
NC
209 defined $self->{rearrange} or $self->{rearrange} = [];
210 ref $self->{rearrange}
211 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
212
213 # keys of $self->{rearrangeHash} are $self->{rearrange}.
214 $self->{rearrangeHash} = undef;
215
216 if (@{ $self->{rearrange} }) {
217 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
218 }
219
220 $self->{normCode} = undef;
a7fbee98
JH
221
222 if (defined $self->{normalization}) {
223 eval { require Unicode::Normalize };
91ae00cb 224 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 225
91ae00cb 226 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 227
91ae00cb
NC
228 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
229 $self->{normCode} = \&Unicode::Normalize::NFD;
230 }
231 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
232 my $norm = $self->{normalization};
233 $self->{normCode} = sub {
1d2654e1
JH
234 Unicode::Normalize::normalize($norm, shift);
235 };
06c8fc8f
RGS
236 eval { $self->{normCode}->("") }; # try
237 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 238 }
a7fbee98 239 }
0116f5dc
JH
240 return;
241}
242
243sub new
244{
245 my $class = shift;
246 my $self = bless { @_ }, $class;
45394607 247
211cc501
CBW
248 if (! exists $self->{table} &&
249 !defined $self->{undefName} && !defined $self->{ignoreName} &&
250 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
251 $self->{__useXS} = \&_fetch_simple;
252 } # XS only
253
aa7758f7
CBW
254 # keys of $self->{suppressHash} are $self->{suppress}.
255 if ($self->{suppress} && @{ $self->{suppress} }) {
256 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
257 } # before read_table()
258
a7fbee98 259 # If undef is passed explicitly, no file is read.
0116f5dc 260 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 261 $self->read_table() if defined $self->{table};
905aa9f0 262
a7fbee98 263 if ($self->{entry}) {
e7f779c8
RGS
264 while ($self->{entry} =~ /([^\n]+)/g) {
265 $self->parseEntry($1);
266 }
a7fbee98 267 }
905aa9f0 268
9f1f04a1 269 $self->{level} ||= MaxLevel;
0116f5dc 270 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 271
abd1ec54 272 $self->{overrideHangul} = FALSE
0116f5dc 273 if ! exists $self->{overrideHangul};
abd1ec54 274 $self->{overrideCJK} = FALSE
0116f5dc 275 if ! exists $self->{overrideCJK};
06c8fc8f 276 $self->{normalization} = 'NFD'
0116f5dc 277 if ! exists $self->{normalization};
3756e7ca
RGS
278 $self->{rearrange} = $self->{rearrangeTable} ||
279 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 280 if ! exists $self->{rearrange};
0116f5dc
JH
281 $self->{backwards} = $self->{backwardsTable}
282 if ! exists $self->{backwards};
a7fbee98 283
3756e7ca 284 $self->checkCollator();
a7fbee98
JH
285
286 return $self;
287}
905aa9f0 288
00e00351
CBW
289sub parseAtmark {
290 my $self = shift;
291 my $line = shift; # after s/^\s*\@//
292
293 if ($line =~ /^version\s*(\S*)/) {
294 $self->{versionTable} ||= $1;
295 }
296 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
297 $self->{variableTable} ||= $1;
298 }
299 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
300 $self->{alternateTable} ||= $1;
301 }
302 elsif ($line =~ /^backwards\s+(\S*)/) {
303 push @{ $self->{backwardsTable} }, $1;
304 }
305 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
306 push @{ $self->{forwardsTable} }, $1;
307 }
308 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
309 push @{ $self->{rearrangeTable} }, _getHexArray($1);
310 }
311}
312
905aa9f0 313sub read_table {
a7fbee98 314 my $self = shift;
a7fbee98 315
211cc501
CBW
316 if ($self->{__useXS}) {
317 my @rest = _fetch_rest(); # complex matter need to parse
318 for my $line (@rest) {
319 next if $line =~ /^\s*#/;
320
321 if ($line =~ s/^\s*\@//) {
322 $self->parseAtmark($line);
323 } else {
324 $self->parseEntry($line);
325 }
326 }
327 return;
328 }
329
e7f779c8
RGS
330 my($f, $fh);
331 foreach my $d (@INC) {
332 $f = File::Spec->catfile($d, @Path, $self->{table});
333 last if open($fh, $f);
334 $f = undef;
335 }
0d50d293
RGS
336 if (!defined $f) {
337 $f = File::Spec->catfile(@Path, $self->{table});
338 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
339 }
a7fbee98 340
6d24ed10
SP
341 while (my $line = <$fh>) {
342 next if $line =~ /^\s*#/;
abd1ec54 343
00e00351
CBW
344 if ($line =~ s/^\s*\@//) {
345 $self->parseAtmark($line);
346 } else {
347 $self->parseEntry($line);
abd1ec54 348 }
45394607 349 }
e7f779c8 350 close $fh;
45394607
JH
351}
352
905aa9f0 353
45394607
JH
354##
355## get $line, parse it, and write an entry in $self
356##
357sub parseEntry
358{
a7fbee98
JH
359 my $self = shift;
360 my $line = shift;
4d36a948 361 my($name, $entry, @uv, @key);
a7fbee98
JH
362
363 return if $line !~ /^\s*[0-9A-Fa-f]/;
364
365 # removes comment and gets name
366 $name = $1
367 if $line =~ s/[#%]\s*(.*)//;
368 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
369
370 # gets element
371 my($e, $k) = split /;/, $line;
372 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
373 if ! $k;
374
4d36a948
ST
375 @uv = _getHexArray($e);
376 return if !@uv;
aa7758f7
CBW
377 return if @uv > 1 && $self->{suppressHash} &&
378 exists $self->{suppressHash}{$uv[0]};
4d36a948 379 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 380
4d36a948 381 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 382 my $ele = pack_U(@uv);
a7fbee98 383
4d36a948
ST
384 # regarded as if it were not entried in the table
385 return
386 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 387
4d36a948
ST
388 # replaced as completely ignorable
389 $k = '[.0000.0000.0000.0000]'
390 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 391 }
0116f5dc 392
4d36a948
ST
393 # replaced as completely ignorable
394 $k = '[.0000.0000.0000.0000]'
395 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
396
4c843366 397 my $is_L3_ignorable = TRUE;
4d36a948 398
caffd4cf
ST
399 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
400 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
ST
401 my @wt = _getHexArray($arr);
402 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 403 $is_L3_ignorable = FALSE
3756e7ca
RGS
404 if $wt[0] || $wt[1] || $wt[2];
405 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
406 # is completely ignorable.
4c843366
JH
407 # For expansion, an entry $is_L3_ignorable
408 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 409 }
caffd4cf 410
e7f779c8 411 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 412
91ae00cb 413 if (@uv > 1) {
b5d9a953
CBW
414 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
415 $self->{maxlength}{$uv[0]} = @uv;
416 }
417 }
418 if (@uv > 2) {
419 while (@uv) {
420 pop @uv;
421 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
422 $self->{contraction}{$fake_entry} = 1;
423 }
91ae00cb 424 }
45394607
JH
425}
426
9f1f04a1 427
45394607
JH
428sub viewSortKey
429{
a7fbee98 430 my $self = shift;
9f1f04a1
RGS
431 $self->visualizeSortKey($self->getSortKey(@_));
432}
0116f5dc 433
d16e9e3d 434
45394607 435##
91ae00cb
NC
436## arrayref of JCPS = splitEnt(string to be collated)
437## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 438##
91ae00cb 439sub splitEnt
45394607 440{
a7fbee98 441 my $self = shift;
4d36a948
ST
442 my $wLen = $_[1];
443
a7fbee98 444 my $code = $self->{preprocess};
0116f5dc 445 my $norm = $self->{normCode};
91ae00cb 446 my $map = $self->{mapping};
a7fbee98
JH
447 my $max = $self->{maxlength};
448 my $reH = $self->{rearrangeHash};
b5d9a953
CBW
449 my $vers = $self->{UCA_Version};
450 my $ver9 = $vers >= 9 && $vers <= 11;
211cc501 451 my $uXS = $self->{__useXS};
a7fbee98 452
4d36a948 453 my ($str, @buf);
a7fbee98 454
4d36a948
ST
455 if ($wLen) {
456 $code and croak "Preprocess breaks character positions. "
457 . "Don't use with index(), match(), etc.";
458 $norm and croak "Normalization breaks character positions. "
459 . "Don't use with index(), match(), etc.";
460 $str = $_[0];
461 }
462 else {
463 $str = $_[0];
464 $str = &$code($str) if ref $code;
465 $str = &$norm($str) if ref $norm;
466 }
a7fbee98 467
4d36a948 468 # get array of Unicode code point of string.
9f1f04a1 469 my @src = unpack_U($str);
4d36a948
ST
470
471 # rearrangement:
472 # Character positions are not kept if rearranged,
473 # then neglected if $wLen is true.
474 if ($reH && ! $wLen) {
a7fbee98
JH
475 for (my $i = 0; $i < @src; $i++) {
476 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
477 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
478 $i++;
479 }
480 }
45394607 481 }
45394607 482
3756e7ca 483 # remove a code point marked as a completely ignorable.
abd1ec54 484 for (my $i = 0; $i < @src; $i++) {
b5d9a953
CBW
485 if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
486 $src[$i] = undef;
487 } elsif ($ver9) {
488 $src[$i] = undef if $map->{ $src[$i] } &&
489 @{ $map->{ $src[$i] } } == 0;
211cc501
CBW
490 if ($uXS) {
491 $src[$i] = undef if _ignorable_simple($src[$i]);
492 }
b5d9a953 493 }
0116f5dc
JH
494 }
495
a7fbee98 496 for (my $i = 0; $i < @src; $i++) {
91ae00cb 497 my $jcps = $src[$i];
3756e7ca
RGS
498
499 # skip removed code point
500 if (! defined $jcps) {
501 if ($wLen && @buf) {
502 $buf[-1][2] = $i + 1;
503 }
504 next;
505 }
506
abd1ec54 507 my $i_orig = $i;
4d36a948 508
3756e7ca
RGS
509 # find contraction
510 if ($max->{$jcps}) {
91ae00cb
NC
511 my $temp_jcps = $jcps;
512 my $jcpsLen = 1;
513 my $maxLen = $max->{$jcps};
4d36a948 514
91ae00cb 515 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 516 next if ! defined $src[$p];
91ae00cb
NC
517 $temp_jcps .= CODE_SEP . $src[$p];
518 $jcpsLen++;
519 if ($map->{$temp_jcps}) {
520 $jcps = $temp_jcps;
4d36a948
ST
521 $i = $p;
522 }
523 }
4d36a948 524
b5d9a953 525 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
06c8fc8f 526 # This process requires Unicode::Normalize.
91ae00cb 527 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
528 # (in spite of bool value of $CVgetCombinClass),
529 # since canonical ordering cannot be expected.
530 # Blocked combining character should not be contracted.
531
06c8fc8f 532 # $self->{normCode} is false in the case of "prenormalized".
b5d9a953
CBW
533 if ($self->{normalization}) {
534 my $cont = $self->{contraction};
06c8fc8f 535 my $preCC = 0;
b5d9a953
CBW
536 my $preCC_uc = 0;
537 my $jcps_uc = $jcps;
538 my(@out, @out_uc);
06c8fc8f
RGS
539
540 for (my $p = $i + 1; $p < @src; $p++) {
541 next if ! defined $src[$p];
b5d9a953 542 my $curCC = $CVgetCombinClass->($src[$p]);
06c8fc8f
RGS
543 last unless $curCC;
544 my $tail = CODE_SEP . $src[$p];
b5d9a953
CBW
545
546 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
547 $cont->{$jcps_uc.$tail})) {
548 $jcps_uc .= $tail;
549 push @out_uc, $p;
550 } else {
551 $preCC_uc = $curCC;
552 }
553
91ae00cb
NC
554 if ($preCC != $curCC && $map->{$jcps.$tail}) {
555 $jcps .= $tail;
b5d9a953 556 push @out, $p;
06c8fc8f
RGS
557 } else {
558 $preCC = $curCC;
559 }
4d36a948 560 }
b5d9a953
CBW
561
562 if ($map->{$jcps_uc}) {
563 $jcps = $jcps_uc;
564 $src[$_] = undef for @out_uc;
565 } else {
566 $src[$_] = undef for @out;
567 }
a7fbee98 568 }
a7fbee98
JH
569 }
570
3756e7ca 571 # skip completely ignorable
211cc501
CBW
572 if ($uXS && $jcps =~ /^[0-9]+\z/ && _ignorable_simple($jcps) ||
573 $map->{$jcps} && @{ $map->{$jcps} } == 0) {
3756e7ca
RGS
574 if ($wLen && @buf) {
575 $buf[-1][2] = $i + 1;
a7fbee98 576 }
3756e7ca 577 next;
a7fbee98 578 }
4d36a948 579
91ae00cb 580 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 581 }
4d36a948 582 return \@buf;
d16e9e3d 583}
45394607 584
539ce3d8
CBW
585##
586## VCE = _pack_override(input, codepoint, derivCode)
587##
588sub _pack_override ($$$) {
589 my $r = shift;
590 my $u = shift;
591 my $der = shift;
592
593 if (ref $r) {
594 return pack(VCE_TEMPLATE, NON_VAR, @$r);
595 } elsif (defined $r) {
596 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
597 } else {
598 return $der->($u);
599 }
600}
d16e9e3d
JH
601
602##
abd1ec54 603## list of VCE = getWt(JCPS)
d16e9e3d
JH
604##
605sub getWt
606{
a7fbee98 607 my $self = shift;
91ae00cb 608 my $u = shift;
abd1ec54 609 my $vbl = $self->{variable};
91ae00cb 610 my $map = $self->{mapping};
0116f5dc 611 my $der = $self->{derivCode};
211cc501 612 my $uXS = $self->{__useXS};
a7fbee98 613
91ae00cb 614 return if !defined $u;
abd1ec54 615 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 616 if $map->{$u};
211cc501
CBW
617 return map(_varCE($vbl, $_), _fetch_simple($u))
618 if $uXS && _exists_simple($u);
a7fbee98 619
91ae00cb
NC
620 # JCPS must not be a contraction, then it's a code point.
621 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1
JH
622 my $hang = $self->{overrideHangul};
623 my @hangulCE;
624 if ($hang) {
539ce3d8
CBW
625 @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
626 } elsif (!defined $hang) {
1d2654e1 627 @hangulCE = $der->($u);
539ce3d8 628 } else {
1d2654e1
JH
629 my $max = $self->{maxlength};
630 my @decH = _decompHangul($u);
631
632 if (@decH == 2) {
633 my $contract = join(CODE_SEP, @decH);
91ae00cb 634 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
635 } else { # must be <@decH == 3>
636 if ($max->{$decH[0]}) {
637 my $contract = join(CODE_SEP, @decH);
91ae00cb 638 if ($map->{$contract}) {
1d2654e1
JH
639 @decH = ($contract);
640 } else {
641 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 642 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
643 }
644 # even if V's ignorable, LT contraction is not supported.
211cc501 645 # If such a situation were required, NFD should be used.
1d2654e1
JH
646 }
647 if (@decH == 3 && $max->{$decH[1]}) {
648 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 649 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
650 }
651 }
652
653 @hangulCE = map({
211cc501
CBW
654 $map->{$_} ? @{ $map->{$_} } :
655 $uXS && _exists_simple($_) ? _fetch_simple($_) :
656 $der->($_);
1d2654e1
JH
657 } @decH);
658 }
abd1ec54 659 return map _varCE($vbl, $_), @hangulCE;
539ce3d8 660 } else {
584e761d
CBW
661 my $cjk = $self->{overrideCJK};
662 my $vers = $self->{UCA_Version};
663 if ($cjk && _isUIdeo($u, $vers)) {
664 my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
665 return map _varCE($vbl, $_), @cjkCE;
666 }
667 if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
668 return map _varCE($vbl, $_), _uideoCE_8($u);
669 }
abd1ec54 670 return map _varCE($vbl, $_), $der->($u);
a7fbee98 671 }
d16e9e3d
JH
672}
673
d16e9e3d
JH
674
675##
676## string sortkey = getSortKey(string arg)
677##
678sub getSortKey
679{
a7fbee98 680 my $self = shift;
91ae00cb 681 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
00e00351 682 my $vers = $self->{UCA_Version};
aa7758f7
CBW
683 my $vbl = $self->{variable};
684 my $term = $self->{hangul_terminator};
91ae00cb 685
abd1ec54 686 my @buf; # weight arrays
aa7758f7 687 if ($term) {
91ae00cb 688 my $preHST = '';
aa7758f7 689 my $termCE = _varCE($vbl, pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
91ae00cb
NC
690 foreach my $jcps (@$rEnt) {
691 # weird things like VL, TL-contraction are not considered!
aa7758f7 692 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
91ae00cb
NC
693 if ($preHST && !$curHST || # hangul before non-hangul
694 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
695 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
696 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
aa7758f7 697 push @buf, $termCE;
91ae00cb
NC
698 }
699 $preHST = $curHST;
abd1ec54 700 push @buf, $self->getWt($jcps);
91ae00cb 701 }
aa7758f7
CBW
702 push @buf, $termCE if $preHST; # end at hangul
703 } else {
91ae00cb 704 foreach my $jcps (@$rEnt) {
abd1ec54 705 push @buf, $self->getWt($jcps);
91ae00cb
NC
706 }
707 }
708
211cc501 709 return $self->mk_SortKey(\@buf);
45394607
JH
710}
711
712
713##
d16e9e3d 714## int compare = cmp(string a, string b)
45394607 715##
5398038e 716sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
717sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
718sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
719sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
720sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
721sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
722sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
723
724##
d16e9e3d 725## list[strings] sorted = sort(list[strings] arg)
45394607 726##
a7fbee98
JH
727sub sort {
728 my $obj = shift;
729 return
730 map { $_->[1] }
731 sort{ $a->[0] cmp $b->[0] }
732 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
733}
734
0116f5dc 735
4d36a948
ST
736##
737## bool _nonIgnorAtLevel(arrayref weights, int level)
738##
739sub _nonIgnorAtLevel($$)
740{
741 my $wt = shift;
742 return if ! defined $wt;
743 my $lv = shift;
9f1f04a1 744 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
ST
745}
746
747##
748## bool _eqArray(
749## arrayref of arrayref[weights] source,
750## arrayref of arrayref[weights] substr,
751## int level)
752## * comparison of graphemes vs graphemes.
753## @$source >= @$substr must be true (check it before call this);
754##
755sub _eqArray($$$)
756{
757 my $source = shift;
758 my $substr = shift;
759 my $lev = shift;
760
761 for my $g (0..@$substr-1){
211cc501 762 # Do the $g'th graphemes have the same number of AV weights?
4d36a948
ST
763 return if @{ $source->[$g] } != @{ $substr->[$g] };
764
765 for my $w (0..@{ $substr->[$g] }-1) {
766 for my $v (0..$lev-1) {
767 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
768 }
769 }
770 }
771 return 1;
772}
773
774##
775## (int position, int length)
776## int position = index(string, substring, position, [undoc'ed grobal])
777##
778## With "grobal" (only for the list context),
779## returns list of arrayref[position, length].
780##
781sub index
782{
91ae00cb
NC
783 my $self = shift;
784 my $str = shift;
785 my $len = length($str);
786 my $subE = $self->splitEnt(shift);
787 my $pos = @_ ? shift : 0;
788 $pos = 0 if $pos < 0;
789 my $grob = shift;
790
791 my $lev = $self->{level};
3756e7ca
RGS
792 my $v2i = $self->{UCA_Version} >= 9 &&
793 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
794
795 if (! @$subE) {
4d36a948
ST
796 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
797 return $grob
798 ? map([$_, 0], $temp..$len)
799 : wantarray ? ($temp,0) : $temp;
800 }
abd1ec54
NC
801 $len < $pos
802 and return wantarray ? () : NOMATCHPOS;
91ae00cb 803 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
804 @$strE
805 or return wantarray ? () : NOMATCHPOS;
806
4d36a948
ST
807 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
808
abd1ec54
NC
809 my $last_is_variable;
810 for my $vwt (map $self->getWt($_), @$subE) {
811 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
812 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 813
3756e7ca
RGS
814 # "Ignorable (L1, L2) after Variable" since track. v. 9
815 if ($v2i) {
abd1ec54
NC
816 if ($var) {
817 $last_is_variable = TRUE;
818 }
819 elsif (!$wt[0]) { # ignorable
4d36a948 820 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
821 }
822 else {
823 $last_is_variable = FALSE;
4d36a948
ST
824 }
825 }
826
abd1ec54
NC
827 if (@subWt && !$var && !$wt[0]) {
828 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1393fe00 829 } elsif ($to_be_pushed) {
abd1ec54 830 push @subWt, [ \@wt ];
4d36a948 831 }
1393fe00 832 # else ===> skipped
4d36a948
ST
833 }
834
835 my $count = 0;
91ae00cb 836 my $end = @$strE - 1;
4d36a948 837
abd1ec54 838 $last_is_variable = FALSE; # reuse
4d36a948
ST
839 for (my $i = 0; $i <= $end; ) { # no $i++
840 my $found_base = 0;
841
842 # fetch a grapheme
843 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
844 for my $vwt ($self->getWt($strE->[$i][0])) {
845 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
846 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 847
3756e7ca
RGS
848 # "Ignorable (L1, L2) after Variable" since track. v. 9
849 if ($v2i) {
abd1ec54
NC
850 if ($var) {
851 $last_is_variable = TRUE;
852 }
853 elsif (!$wt[0]) { # ignorable
4d36a948 854 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
855 }
856 else {
857 $last_is_variable = FALSE;
4d36a948
ST
858 }
859 }
860
abd1ec54
NC
861 if (@strWt && !$var && !$wt[0]) {
862 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 863 $finPos[-1] = $strE->[$i][2];
4d36a948 864 } elsif ($to_be_pushed) {
abd1ec54 865 push @strWt, [ \@wt ];
91ae00cb 866 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 867 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 868 push @finPos, $strE->[$i][2];
4d36a948
ST
869 $found_base++;
870 }
871 # else ===> no-op
872 }
873 $i++;
874 }
875
876 # try to match
877 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
878 if ($iniPos[0] != NOMATCHPOS &&
879 $finPos[$#subWt] != NOMATCHPOS &&
880 _eqArray(\@strWt, \@subWt, $lev)) {
881 my $temp = $iniPos[0] + $pos;
882
883 if ($grob) {
884 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
885 splice @strWt, 0, $#subWt;
886 splice @iniPos, 0, $#subWt;
887 splice @finPos, 0, $#subWt;
888 }
889 else {
890 return wantarray
891 ? ($temp, $finPos[$#subWt] - $iniPos[0])
892 : $temp;
893 }
894 }
895 shift @strWt;
896 shift @iniPos;
897 shift @finPos;
898 }
899 }
900
901 return $grob
902 ? @g_ret
903 : wantarray ? () : NOMATCHPOS;
904}
905
906##
907## scalarref to matching part = match(string, substring)
908##
909sub match
910{
911 my $self = shift;
912 if (my($pos,$len) = $self->index($_[0], $_[1])) {
913 my $temp = substr($_[0], $pos, $len);
914 return wantarray ? $temp : \$temp;
915 # An lvalue ref \substr should be avoided,
916 # since its value is affected by modification of its referent.
917 }
918 else {
919 return;
920 }
921}
922
923##
924## arrayref matching parts = gmatch(string, substring)
925##
926sub gmatch
927{
928 my $self = shift;
929 my $str = shift;
930 my $sub = shift;
931 return map substr($str, $_->[0], $_->[1]),
932 $self->index($str, $sub, 0, 'g');
933}
934
935##
936## bool subst'ed = subst(string, substring, replace)
937##
938sub subst
939{
940 my $self = shift;
941 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
942
943 if (my($pos,$len) = $self->index($_[0], $_[1])) {
944 if ($code) {
945 my $mat = substr($_[0], $pos, $len);
946 substr($_[0], $pos, $len, $code->($mat));
947 } else {
948 substr($_[0], $pos, $len, $_[2]);
949 }
950 return TRUE;
951 }
952 else {
953 return FALSE;
954 }
955}
956
957##
958## int count = gsubst(string, substring, replace)
959##
960sub gsubst
961{
962 my $self = shift;
963 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
964 my $cnt = 0;
965
966 # Replacement is carried out from the end, then use reverse.
967 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
968 if ($code) {
969 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
970 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
971 } else {
972 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
973 }
974 $cnt++;
975 }
976 return $cnt;
977}
978
45394607
JH
9791;
980__END__
981
982=head1 NAME
983
a7fbee98 984Unicode::Collate - Unicode Collation Algorithm
45394607
JH
985
986=head1 SYNOPSIS
987
988 use Unicode::Collate;
989
990 #construct
5398038e 991 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
992
993 #sort
5398038e 994 @sorted = $Collator->sort(@not_sorted);
45394607
JH
995
996 #compare
a7fbee98 997 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 998
539ce3d8
CBW
999B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1000according to Perl's Unicode support. See L<perlunicode>,
1001L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1002Otherwise you can use C<preprocess> or should decode them before.
91ae00cb 1003
45394607
JH
1004=head1 DESCRIPTION
1005
3756e7ca
RGS
1006This module is an implementation of Unicode Technical Standard #10
1007(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1008
45394607
JH
1009=head2 Constructor and Tailoring
1010
539ce3d8
CBW
1011The C<new> method returns a collator object. If new() is called
1012with no parameters, the collator should do the default collation.
d16e9e3d 1013
5398038e 1014 $Collator = Unicode::Collate->new(
0116f5dc 1015 UCA_Version => $UCA_Version,
aa7758f7 1016 alternate => $alternate, # alias for 'variable'
45394607
JH
1017 backwards => $levelNumber, # or \@levelNumbers
1018 entry => $element,
91ae00cb 1019 hangul_terminator => $term_primary_weight,
45394607
JH
1020 ignoreName => qr/$ignoreName/,
1021 ignoreChar => qr/$ignoreChar/,
1022 katakana_before_hiragana => $bool,
1023 level => $collationLevel,
91ae00cb 1024 normalization => $normalization_form,
45394607
JH
1025 overrideCJK => \&overrideCJK,
1026 overrideHangul => \&overrideHangul,
1027 preprocess => \&preprocess,
1028 rearrange => \@charList,
aa7758f7 1029 suppress => \@charList,
45394607
JH
1030 table => $filename,
1031 undefName => qr/$undefName/,
1032 undefChar => qr/$undefChar/,
1033 upper_before_lower => $bool,
91ae00cb 1034 variable => $variable,
45394607 1035 );
45394607
JH
1036
1037=over 4
1038
0116f5dc
JH
1039=item UCA_Version
1040
3756e7ca
RGS
1041If the tracking version number of UCA is given,
1042behavior of that tracking version is emulated on collating.
0116f5dc 1043If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1044
b5d9a953 1045The following tracking versions are supported. The default is 20.
3756e7ca 1046
0d50d293 1047 UCA Unicode Standard DUCET (@version)
b5d9a953 1048 -------------------------------------------------------
0d50d293
RGS
1049 8 3.1 3.0.1 (3.0.1d9)
1050 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1051 11 4.0 4.0.0 (4.0.0)
1052 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1053 16 5.0 5.0.0 (5.0.0)
1054 18 5.1.0 5.1.0 (5.1.0)
00e00351 1055 20 5.2.0 5.2.0 (5.2.0)
b5d9a953 1056 22 6.0.0 6.0.0 (6.0.0)
3756e7ca
RGS
1057
1058Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1059
211cc501 1060* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
b5d9a953
CBW
1061since C<UCA_Version> 22.
1062
1063* Fully ignorable characters were ignored, and would not interrupt
1064contractions with C<UCA_Version> 9 and 11.
1065
1066* Treatment of ignorables after variables and some behaviors
1067were changed at C<UCA_Version> 9.
1068
1069* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1070depend on C<UCA_Version>.
1071
1072* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1073C<hangul_terminator>.
1074
3756e7ca 1075=item alternate
0116f5dc 1076
3756e7ca 1077-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1078
3756e7ca
RGS
1079For backward compatibility, C<alternate> (old name) can be used
1080as an alias for C<variable>.
0116f5dc 1081
45394607
JH
1082=item backwards
1083
4d36a948 1084-- see 3.1.2 French Accents, UTS #10.
45394607
JH
1085
1086 backwards => $levelNumber or \@levelNumbers
1087
1088Weights in reverse order; ex. level 2 (diacritic ordering) in French.
68adb2b0
CBW
1089If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1090forwards at all the levels.
45394607
JH
1091
1092=item entry
1093
4d36a948 1094-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1095
91ae00cb
NC
1096If the same character (or a sequence of characters) exists
1097in the collation element table through C<table>,
211cc501 1098mapping to collation elements is overridden.
91ae00cb 1099If it does not exist, the mapping is defined additionally.
45394607 1100
abd1ec54
NC
1101 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11020063 0068 ; [.0E6A.0020.0002.0063] # ch
11030043 0068 ; [.0E6A.0020.0007.0043] # Ch
11040043 0048 ; [.0E6A.0020.0008.0043] # CH
1105006C 006C ; [.0F4C.0020.0002.006C] # ll
1106004C 006C ; [.0F4C.0020.0007.004C] # Ll
1107004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
110800F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1109006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
111000D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1111004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1112ENTRY
1113
1114 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
111500E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
111600C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1117ENTRY
45394607 1118
4d36a948 1119B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1120B<must> be a Unicode code point (defined as hexadecimal),
1121but not a native code point.
4d36a948
ST
1122So C<0063> must always denote C<U+0063>,
1123but not a character of C<"\x63">.
1124
abd1ec54
NC
1125Weighting may vary depending on collation element table.
1126So ensure the weights defined in C<entry> will be consistent with
1127those in the collation element table loaded via C<table>.
1128
1129In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1130and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1131(as a value between C<0E60> and C<0E6D>)
1132makes ordering as C<C E<lt> CH E<lt> D>.
1133Exactly speaking DUCET already has some characters between C<C> and C<D>:
1134C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1135C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1136and C<c-curl> (C<U+0255>) with C<0E69>.
1137Then primary weight C<0E6A> for C<CH> makes C<CH>
1138ordered between C<c-curl> and C<D>.
1139
91ae00cb
NC
1140=item hangul_terminator
1141
3756e7ca 1142-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1143
1144If a true value is given (non-zero but should be positive),
1145it will be added as a terminator primary weight to the end of
1146every standard Hangul syllable. Secondary and any higher weights
1147for terminator are set to zero.
1148If the value is false or C<hangul_terminator> key does not exist,
1149insertion of terminator weights will not be performed.
1150
1151Boundaries of Hangul syllables are determined
1152according to conjoining Jamo behavior in F<the Unicode Standard>
1153and F<HangulSyllableType.txt>.
1154
1155B<Implementation Note:>
1156(1) For expansion mapping (Unicode character mapped
1157to a sequence of collation elements), a terminator will not be added
1158between collation elements, even if Hangul syllable boundary exists there.
1159Addition of terminator is restricted to the next position
1160to the last collation element.
1161
1162(2) Non-conjoining Hangul letters
1163(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1164automatically terminated with a terminator primary weight.
1165These characters may need terminator included in a collation element
1166table beforehand.
1167
45394607
JH
1168=item ignoreChar
1169
3756e7ca
RGS
1170=item ignoreName
1171
1172-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1173
caffd4cf
ST
1174Makes the entry in the table completely ignorable;
1175i.e. as if the weights were zero at all level.
45394607 1176
3756e7ca
RGS
1177Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1178will be ignored. Through C<ignoreName>, any character whose name
1179(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1180will be ignored.
1181
a7fbee98 1182E.g. when 'a' and 'e' are ignorable,
45394607
JH
1183'element' is equal to 'lament' (or 'lmnt').
1184
3756e7ca
RGS
1185=item katakana_before_hiragana
1186
1187-- see 7.3.1 Tertiary Weight Table, UTS #10.
1188
1189By default, hiragana is before katakana.
1190If the parameter is made true, this is reversed.
1191
1192B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1193distinctions must occur in level 3, and their weights at level 3 must be
1194same as those mentioned in 7.3.1, UTS #10.
1195If you define your collation elements which violate this requirement,
1196this parameter does not work validly.
1197
45394607
JH
1198=item level
1199
3756e7ca 1200-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1201
1202Set the maximum level.
1203Any higher levels than the specified one are ignored.
1204
1205 Level 1: alphabetic ordering
1206 Level 2: diacritic ordering
1207 Level 3: case ordering
91ae00cb 1208 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1209
1210 ex.level => 2,
1211
a7fbee98
JH
1212If omitted, the maximum is the 4th.
1213
45394607
JH
1214=item normalization
1215
3756e7ca 1216-- see 4.1 Normalize, UTS #10.
45394607 1217
905aa9f0 1218If specified, strings are normalized before preparation of sort keys
45394607
JH
1219(the normalization is executed after preprocess).
1220
1d2654e1
JH
1221A form name C<Unicode::Normalize::normalize()> accepts will be applied
1222as C<$normalization_form>.
06c8fc8f 1223Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1224See C<Unicode::Normalize::normalize()> for detail.
1225If omitted, C<'NFD'> is used.
45394607 1226
91ae00cb 1227C<normalization> is performed after C<preprocess> (if defined).
45394607 1228
06c8fc8f
RGS
1229Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1230though they are not concerned with C<Unicode::Normalize::normalize()>.
1231
1232If C<undef> (not a string C<"undef">) is passed explicitly
1233as the value for this key,
45394607 1234any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1235if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1236only contiguous contractions are resolved;
1237e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1238C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1239In this point,
1240C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1241B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1242
1243In the case of C<(normalization =E<gt> "prenormalized")>,
1244any normalization is not performed, but
b5d9a953 1245discontiguous contractions with combining characters are performed.
06c8fc8f
RGS
1246Therefore
1247C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1248B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1249If source strings are finely prenormalized,
1250C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1251
1252Except C<(normalization =E<gt> undef)>,
1253B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1254
1255=item overrideCJK
1256
4d36a948 1257-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1258
584e761d
CBW
1259By default, CJK unified ideographs are ordered in Unicode codepoint
1260order, but those in the CJK Unified Ideographs block are lesser than
1261those in the CJK Unified Ideographs Extension A etc.
00e00351 1262
b5d9a953
CBW
1263 In the CJK Unified Ideographs block:
1264 U+4E00..U+9FA5 if UCA_Version is 8 to 11.
1265 U+4E00..U+9FBB if UCA_Version is 14 to 16.
1266 U+4E00..U+9FC3 if UCA_Version is 18.
1267 U+4E00..U+9FCB if UCA_Version is 20 or greater.
00e00351 1268
b5d9a953
CBW
1269 In the CJK Unified Ideographs Extension blocks:
1270 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1271 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater.
1272 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
91ae00cb 1273
584e761d 1274Through C<overrideCJK>, ordering of CJK unified ideographs (including
211cc501 1275extensions) can be overridden.
45394607 1276
584e761d 1277ex. CJK unified ideographs in the JIS code point order.
45394607
JH
1278
1279 overrideCJK => sub {
a7fbee98
JH
1280 my $u = shift; # get a Unicode codepoint
1281 my $b = pack('n', $u); # to UTF-16BE
1282 my $s = your_unicode_to_sjis_converter($b); # convert
1283 my $n = unpack('n', $s); # convert sjis to short
1284 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1285 },
1286
539ce3d8
CBW
1287The return value may be an arrayref of 1st to 4th weights as shown
1288above. The return value may be an integer as the primary weight
1289as shown below. If C<undef> is returned, the default derived
1290collation element will be used.
1291
1292 overrideCJK => sub {
1293 my $u = shift; # get a Unicode codepoint
1294 my $b = pack('n', $u); # to UTF-16BE
1295 my $s = your_unicode_to_sjis_converter($b); # convert
1296 my $n = unpack('n', $s); # convert sjis to short
1297 return $n; # return the primary weight
1298 },
1299
1300The return value may be a list containing zero or more of
1301an arrayref, an integer, or C<undef>.
1302
584e761d 1303ex. ignores all CJK unified ideographs.
a7fbee98
JH
1304
1305 overrideCJK => sub {()}, # CODEREF returning empty list
1306
1307 # where ->eq("Pe\x{4E00}rl", "Perl") is true
584e761d 1308 # as U+4E00 is a CJK unified ideograph and to be ignorable.
a7fbee98
JH
1309
1310If C<undef> is passed explicitly as the value for this key,
584e761d
CBW
1311weights for CJK unified ideographs are treated as undefined.
1312But assignment of weight for CJK unified ideographs
1313in C<table> or C<entry> is still valid.
1314
1315B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1316C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1317C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
211cc501 1318ideographs. But they can't be overridden via C<overrideCJK> when you use
584e761d
CBW
1319DUCET, as the table includes weights for them. C<table> or C<entry> has
1320priority over C<overrideCJK>.
a7fbee98
JH
1321
1322=item overrideHangul
1323
4d36a948 1324-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1325
584e761d 1326By default, Hangul syllables are decomposed into Hangul Jamo,
abd1ec54 1327even if C<(normalization =E<gt> undef)>.
211cc501 1328But the mapping of Hangul syllables may be overridden.
a7fbee98 1329
3756e7ca 1330This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1331
584e761d
CBW
1332If you want to override the mapping of Hangul syllables,
1333NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1334Hangul syllables before overriding. FCD may decompose Hangul syllables
1335as the case may be.
45394607 1336
a7fbee98 1337If C<undef> is passed explicitly as the value for this key,
584e761d 1338weight for Hangul syllables is treated as undefined
a7fbee98 1339without decomposition into Hangul Jamo.
584e761d
CBW
1340But definition of weight for Hangul syllables
1341in C<table> or C<entry> is still valid.
a7fbee98 1342
45394607
JH
1343=item preprocess
1344
4d36a948 1345-- see 5.1 Preprocessing, UTS #10.
45394607
JH
1346
1347If specified, the coderef is used to preprocess
1348before the formation of sort keys.
1349
a7fbee98 1350ex. dropping English articles, such as "a" or "the".
45394607
JH
1351Then, "the pen" is before "a pencil".
1352
1353 preprocess => sub {
1354 my $str = shift;
a7fbee98 1355 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1356 return $str;
45394607
JH
1357 },
1358
91ae00cb 1359C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1360
539ce3d8
CBW
1361ex. decoding strings in a legacy encoding such as shift-jis:
1362
1363 $sjis_collator = Unicode::Collate->new(
1364 preprocess => \&your_shiftjis_to_unicode_decoder,
1365 );
1366 @result = $sjis_collator->sort(@shiftjis_strings);
1367
1368B<Note:> Strings returned from the coderef will be interpreted
1369according to Perl's Unicode support. See L<perlunicode>,
1370L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1371
45394607
JH
1372=item rearrange
1373
4d36a948 1374-- see 3.1.3 Rearrangement, UTS #10.
45394607
JH
1375
1376Characters that are not coded in logical order and to be rearranged.
3756e7ca 1377If C<UCA_Version> is equal to or lesser than 11, default is:
45394607
JH
1378
1379 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1380
3756e7ca
RGS
1381If you want to disallow any rearrangement, pass C<undef> or C<[]>
1382(a reference to empty list) as the value for this key.
1383
74b94a79
CBW
1384If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1385(i.e. no rearrangement).
a7fbee98 1386
0116f5dc
JH
1387B<According to the version 9 of UCA, this parameter shall not be used;
1388but it is not warned at present.>
1389
aa7758f7
CBW
1390=item suppress
1391
1392-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1393UTS #35 (LDML).
1394
539ce3d8 1395Contractions beginning with the specified characters are suppressed,
584e761d 1396even if those contractions are defined in C<table> or C<entry>.
aa7758f7
CBW
1397
1398An example for Russian and some languages using the Cyrillic script:
1399
1400 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1401
1402where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1403
45394607
JH
1404=item table
1405
4d36a948 1406-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1407
91ae00cb 1408You can use another collation element table if desired.
45394607 1409
e7f779c8 1410The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1411on C<@INC>. Say, if the filename is F<Foo.txt>,
1412the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1413
1414By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1415If you will prepare your own table file, any name other than F<allkeys.txt>
1416may be better to avoid namespace conflict.
45394607 1417
00e00351
CBW
1418B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1419module, and it may save time at the run time.
1420Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1421or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
1393fe00 1422will prevent this module from using the compiled DUCET.
00e00351 1423
a7fbee98 1424If C<undef> is passed explicitly as the value for this key,
91ae00cb 1425no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1426
1427A typical way to define a collation element table
1428without any file of table:
1429
1430 $onlyABC = Unicode::Collate->new(
1431 table => undef,
1432 entry => << 'ENTRIES',
14330061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
14340041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
14350062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
14360042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
14370063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
14380043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1439ENTRIES
1440 );
905aa9f0 1441
3756e7ca
RGS
1442If C<ignoreName> or C<undefName> is used, character names should be
1443specified as a comment (following C<#>) on each line.
45394607
JH
1444
1445=item undefChar
1446
3756e7ca
RGS
1447=item undefName
1448
4d36a948 1449-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1450
584e761d 1451Undefines the collation element as if it were unassigned in the C<table>.
45394607
JH
1452This reduces the size of the table.
1453If an unassigned character appears in the string to be collated,
1454the sort key is made from its codepoint
1455as a single-character collation element,
1456as it is greater than any other assigned collation elements
1457(in the codepoint order among the unassigned characters).
1458But, it'd be better to ignore characters
1459unfamiliar to you and maybe never used.
1460
3756e7ca
RGS
1461Through C<undefChar>, any character matching C<qr/$undefChar/>
1462will be undefined. Through C<undefName>, any character whose name
1463(given in the C<table> file as a comment) matches C<qr/$undefName/>
1464will be undefined.
1465
e7f779c8
RGS
1466ex. Collation weights for beyond-BMP characters are not stored in object:
1467
1468 undefChar => qr/[^\0-\x{fffd}]/,
1469
45394607
JH
1470=item upper_before_lower
1471
3756e7ca 1472-- see 6.6 Case Comparisons, UTS #10.
45394607 1473
3756e7ca
RGS
1474By default, lowercase is before uppercase.
1475If the parameter is made true, this is reversed.
45394607 1476
3756e7ca
RGS
1477B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1478distinctions must occur in level 3, and their weights at level 3 must be
1479same as those mentioned in 7.3.1, UTS #10.
1480If you define your collation elements which differs from this requirement,
1481this parameter doesn't work validly.
45394607 1482
91ae00cb
NC
1483=item variable
1484
91ae00cb
NC
1485-- see 3.2.2 Variable Weighting, UTS #10.
1486
91ae00cb
NC
1487This key allows to variable weighting for variable collation elements,
1488which are marked with an ASTERISK in the table
211cc501 1489(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
91ae00cb
NC
1490
1491 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1492
1493These names are case-insensitive.
1494By default (if specification is omitted), 'shifted' is adopted.
1495
1496 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1497 considered at the 4th level.
1498
abd1ec54 1499 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1500
1501 'Shifted' Variable elements are made ignorable at levels 1 through 3
1502 their level 4 weight is replaced by the old level 1 weight.
1503 Level 4 weight for Non-Variable elements is 0xFFFF.
1504
1505 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1506 are trimmed.
1507
45394607
JH
1508=back
1509
3164dd77 1510=head2 Methods for Collation
45394607
JH
1511
1512=over 4
1513
5398038e 1514=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1515
1516Sorts a list of strings.
1517
5398038e 1518=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1519
1520Returns 1 (when C<$a> is greater than C<$b>)
1521or 0 (when C<$a> is equal to C<$b>)
1522or -1 (when C<$a> is lesser than C<$b>).
1523
5398038e 1524=item C<$result = $Collator-E<gt>eq($a, $b)>
1525
1526=item C<$result = $Collator-E<gt>ne($a, $b)>
1527
1528=item C<$result = $Collator-E<gt>lt($a, $b)>
1529
1530=item C<$result = $Collator-E<gt>le($a, $b)>
1531
1532=item C<$result = $Collator-E<gt>gt($a, $b)>
1533
1534=item C<$result = $Collator-E<gt>ge($a, $b)>
1535
a7fbee98 1536They works like the same name operators as theirs.
5398038e 1537
1538 eq : whether $a is equal to $b.
1539 ne : whether $a is not equal to $b.
1540 lt : whether $a is lesser than $b.
1541 le : whether $a is lesser than $b or equal to $b.
1542 gt : whether $a is greater than $b.
1543 ge : whether $a is greater than $b or equal to $b.
1544
1545=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1546
3756e7ca 1547-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1548
1549Returns a sort key.
1550
1551You compare the sort keys using a binary comparison
1552and get the result of the comparison of the strings using UCA.
1553
5398038e 1554 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1555
1556 is equivalent to
1557
5398038e 1558 $Collator->cmp($a, $b)
45394607 1559
a7fbee98
JH
1560=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1561
3756e7ca
RGS
1562Converts a sorting key into its representation form.
1563If C<UCA_Version> is 8, the output is slightly different.
1564
a7fbee98
JH
1565 use Unicode::Collate;
1566 my $c = Unicode::Collate->new();
1567 print $c->viewSortKey("Perl"),"\n";
1568
0116f5dc
JH
1569 # output:
1570 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1571 # Level 1 Level 2 Level 3 Level 4
1572
4d36a948
ST
1573=back
1574
1575=head2 Methods for Searching
d16e9e3d 1576
3756e7ca 1577B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
4d36a948
ST
1578for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1579C<subst>, C<gsubst>) is croaked,
1580as the position and the length might differ
1581from those on the specified string.
3756e7ca 1582(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
d16e9e3d 1583
4d36a948
ST
1584The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1585like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1586but they are not aware of any pattern, but only a literal substring.
1587
1588=over 4
1589
1590=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1591
1592=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1593
1594If C<$substring> matches a part of C<$string>, returns
1595the position of the first occurrence of the matching part in scalar context;
1596in list context, returns a two-element list of
1597the position and the length of the matching part.
1598
d16e9e3d
JH
1599If C<$substring> does not match any part of C<$string>,
1600returns C<-1> in scalar context and
1601an empty list in list context.
1602
1603e.g. you say
1604
5398038e 1605 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948
ST
1606 # (normalization => undef) is REQUIRED.
1607 my $str = "Ich muß studieren Perl.";
1608 my $sub = "MÜSS";
d16e9e3d 1609 my $match;
a7fbee98 1610 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1611 $match = substr($str, $pos, $len);
d16e9e3d
JH
1612 }
1613
4d36a948 1614and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1615is primary equal to C<"MÜSS">.
4d36a948
ST
1616
1617=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1618
1619=item C<($match) = $Collator-E<gt>match($string, $substring)>
1620
1621If C<$substring> matches a part of C<$string>, in scalar context, returns
1622B<a reference to> the first occurrence of the matching part
1623(C<$match_ref> is always true if matches,
1624since every reference is B<true>);
1625in list context, returns the first occurrence of the matching part.
1626
1627If C<$substring> does not match any part of C<$string>,
1628returns C<undef> in scalar context and
1629an empty list in list context.
1630
1631e.g.
1632
1633 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1634 print "matches [$$match_ref].\n";
1635 } else {
1636 print "doesn't match.\n";
1637 }
1638
3756e7ca 1639 or
4d36a948
ST
1640
1641 if (($match) = $Collator->match($str, $sub)) { # list context
1642 print "matches [$match].\n";
1643 } else {
1644 print "doesn't match.\n";
1645 }
1646
1647=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1648
1649If C<$substring> matches a part of C<$string>, returns
1650all the matching parts (or matching count in scalar context).
1651
1652If C<$substring> does not match any part of C<$string>,
1653returns an empty list.
1654
1655=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1656
1657If C<$substring> matches a part of C<$string>,
1658the first occurrence of the matching part is replaced by C<$replacement>
1659(C<$string> is modified) and return C<$count> (always equals to C<1>).
1660
1661C<$replacement> can be a C<CODEREF>,
1662taking the matching part as an argument,
1663and returning a string to replace the matching part
1664(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1665
1666=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1667
1668If C<$substring> matches a part of C<$string>,
1669all the occurrences of the matching part is replaced by C<$replacement>
1670(C<$string> is modified) and return C<$count>.
1671
1672C<$replacement> can be a C<CODEREF>,
1673taking the matching part as an argument,
1674and returning a string to replace the matching part
1675(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1676
1677e.g.
1678
1679 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1680 # (normalization => undef) is REQUIRED.
3756e7ca 1681 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
4d36a948
ST
1682 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1683
3756e7ca 1684 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
4d36a948 1685 # i.e., all the camels are made bold-faced.
d16e9e3d 1686
45394607
JH
1687=back
1688
3164dd77
ST
1689=head2 Other Methods
1690
1691=over 4
1692
0116f5dc
JH
1693=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1694
68adb2b0
CBW
1695=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1696
0116f5dc
JH
1697Change the value of specified keys and returns the changed part.
1698
1699 $Collator = Unicode::Collate->new(level => 4);
1700
1701 $Collator->eq("perl", "PERL"); # false
1702
1703 %old = $Collator->change(level => 2); # returns (level => 4).
1704
1705 $Collator->eq("perl", "PERL"); # true
1706
1707 $Collator->change(%old); # returns (level => 2).
1708
1709 $Collator->eq("perl", "PERL"); # false
1710
1711Not all C<(key,value)>s are allowed to be changed.
1712See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1713
1714In the scalar context, returns the modified collator
1715(but it is B<not> a clone from the original).
1716
1717 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1718
1719 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1720
1721 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1722
91ae00cb 1723=item C<$version = $Collator-E<gt>version()>
3164dd77 1724
91ae00cb
NC
1725Returns the version number (a string) of the Unicode Standard
1726which the C<table> file used by the collator object is based on.
1727If the table does not include a version line (starting with C<@version>),
1728returns C<"unknown">.
1729
1730=item C<UCA_Version()>
3164dd77 1731
91ae00cb 1732Returns the tracking version number of UTS #10 this module consults.
b5d9a953
CBW
1733C<UCA_Version()> should return the tracking version corresponding
1734with the DUCET incorporated.
3164dd77 1735
91ae00cb
NC
1736=item C<Base_Unicode_Version()>
1737
1738Returns the version number of UTS #10 this module consults.
3164dd77
ST
1739
1740=back
1741
3756e7ca
RGS
1742=head1 EXPORT
1743
1744No method will be exported.
45394607 1745
0d50d293
RGS
1746=head1 INSTALL
1747
1748Though this module can be used without any C<table> file,
1749to use this module easily, it is recommended to install a table file
1750in the UCA format, by copying it under the directory
1751<a place in @INC>/Unicode/Collate.
1752
6d24ed10
SP
1753The most preferable one is "The Default Unicode Collation Element Table"
1754(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
1755
1756 http://www.unicode.org/Public/UCA/
1757
1758 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1759
1760If DUCET is not installed, it is recommended to copy the file
1761from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1762to <a place in @INC>/Unicode/Collate/allkeys.txt
1763manually.
1764
3756e7ca
RGS
1765=head1 CAVEATS
1766
1767=over 4
45394607 1768
3756e7ca 1769=item Normalization
45394607 1770
3756e7ca
RGS
1771Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1772module (see L<Unicode::Normalize>).
45394607 1773
5398038e 1774If you need not it (say, in the case when you need not
45394607
JH
1775handle any combining characters),
1776assign C<normalization =E<gt> undef> explicitly.
1777
4d36a948 1778-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1779
3756e7ca 1780=item Conformance Test
0116f5dc 1781
10d7ec48
NC
1782The Conformance Test for the UCA is available
1783under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
1784
1785For F<CollationTest_SHIFTED.txt>,
1786a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1787for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1788C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1789
4d36a948 1790B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1791
3756e7ca
RGS
1792=back
1793
0d50d293 1794=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 1795
0d50d293 1796The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
211cc501 1797<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2011,
0d50d293 1798SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1799
0d50d293
RGS
1800This module is free software; you can redistribute it and/or
1801modify it under the same terms as Perl itself.
45394607 1802
00e00351
CBW
1803The file Unicode/Collate/allkeys.txt was copied verbatim
1804from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
1805This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
6d24ed10 1806Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
45394607
JH
1807
1808=head1 SEE ALSO
1809
1810=over 4
1811
91ae00cb
NC
1812=item Unicode Collation Algorithm - UTS #10
1813
1814L<http://www.unicode.org/reports/tr10/>
1815
1816=item The Default Unicode Collation Element Table (DUCET)
1817
10d7ec48 1818L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1819
91ae00cb 1820=item The conformance test for the UCA
45394607 1821
10d7ec48 1822L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1823
10d7ec48 1824L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1825
91ae00cb 1826=item Hangul Syllable Type
0116f5dc 1827
10d7ec48 1828L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1829
91ae00cb 1830=item Unicode Normalization Forms - UAX #15
a7fbee98 1831
91ae00cb 1832L<http://www.unicode.org/reports/tr15/>
a7fbee98 1833
aa7758f7
CBW
1834=item Unicode Locale Data Markup Language (LDML) - UTS #35
1835
1836L<http://www.unicode.org/reports/tr35/>
1837
45394607
JH
1838=back
1839
1840=cut