This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Math::BigInt from vesion 1.999726(_01) to 1.999727
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
d8e4b4ea
CBW
7 unless (0x41 == unpack('U', 'A')) {
8 die "Unicode::Collate cannot get a Unicode code point\n";
9 }
4a2e806c
JH
10}
11
45394607
JH
12use 5.006;
13use strict;
14use warnings;
15use Carp;
e69a2255 16use File::Spec;
5398038e 17
10d7ec48
NC
18no warnings 'utf8';
19
fcd4e2f8 20our $VERSION = '1.17';
45394607
JH
21our $PACKAGE = __PACKAGE__;
22
19265284 23### begin XS only ###
f58b9ef1
CBW
24require DynaLoader;
25our @ISA = qw(DynaLoader);
26bootstrap Unicode::Collate $VERSION;
19265284 27### end XS only ###
f58b9ef1 28
e7f779c8
RGS
29my @Path = qw(Unicode Collate);
30my $KeyFile = "allkeys.txt";
45394607 31
4d36a948
TS
32# Perl's boolean
33use constant TRUE => 1;
34use constant FALSE => "";
35use constant NOMATCHPOS => -1;
36
37# A coderef to get combining class imported from Unicode::Normalize
38# (i.e. \&Unicode::Normalize::getCombinClass).
39# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 40my $CVgetCombinClass;
4d36a948 41
9f1f04a1
RGS
42# Supported Levels
43use constant MinLevel => 1;
44use constant MaxLevel => 4;
45
4d36a948 46# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
47use constant Min2Wt => 0x20;
48use constant Min3Wt => 0x02;
4d36a948
TS
49
50# Shifted weight at 4th level
9f1f04a1 51use constant Shift4Wt => 0xFFFF;
4d36a948 52
4d36a948 53# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
4d36a948
TS
54use constant VCE_TEMPLATE => 'Cn4';
55
4d36a948 56# A sort key: 16-bit weights
4d36a948
TS
57use constant KEY_TEMPLATE => 'n*';
58
750da838
CBW
59# The tie-breaking: 32-bit weights
60use constant TIE_TEMPLATE => 'N*';
61
4d36a948
TS
62# Level separator in a sort key:
63# i.e. pack(KEY_TEMPLATE, 0)
64use constant LEVEL_SEP => "\0\0";
65
66# As Unicode code point separator for hash keys.
67# A joined code point string (denoted by JCPS below)
68# like "65;768" is used for internal processing
69# instead of Perl's Unicode string like "\x41\x{300}",
70# as the native code point is different from the Unicode code point
71# on EBCDIC platform.
72# This character must not be included in any stringified
73# representation of an integer.
74use constant CODE_SEP => ';';
19265284 75 # NOTE: in regex /;/ is used for $jcps!
4d36a948
TS
76
77# boolean values of variable weights
0116f5dc
JH
78use constant NON_VAR => 0; # Non-Variable character
79use constant VAR => 1; # Variable character
3164dd77 80
91ae00cb 81# specific code points
00e00351
CBW
82use constant Hangul_SIni => 0xAC00;
83use constant Hangul_SFin => 0xD7A3;
584e761d 84
4d36a948 85# Logical_Order_Exception in PropList.txt
e7f779c8 86my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 87
6d3c12b5
CBW
88# for highestFFFF and minimalFFFE
89my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
90my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE);
91
fcd4e2f8 92sub UCA_Version { "32" }
a7fbee98 93
fcd4e2f8 94sub Base_Unicode_Version { "8.0.0" }
a7fbee98 95
9f1f04a1
RGS
96######
97
9f1f04a1 98sub pack_U {
ae6aa562 99 return pack('U*', @_);
9f1f04a1
RGS
100}
101
e9d2bd8a
CBW
102sub unpack_U {
103 return unpack('U*', shift(@_).pack('U*'));
104}
4779e03e 105
9f1f04a1
RGS
106######
107
91ae00cb
NC
108my (%VariableOK);
109@VariableOK{ qw/
0116f5dc 110 blanked non-ignorable shifted shift-trimmed
91ae00cb 111 / } = (); # keys lowercased
0116f5dc
JH
112
113our @ChangeOK = qw/
114 alternate backwards level normalization rearrange
19265284 115 katakana_before_hiragana upper_before_lower ignore_level2
60f577e0 116 overrideCJK overrideHangul overrideOut preprocess UCA_Version
6d3c12b5 117 hangul_terminator variable identical highestFFFF minimalFFFE
f8187d97 118 long_contraction
0116f5dc
JH
119 /;
120
121our @ChangeNG = qw/
b5d9a953 122 entry mapping table maxlength contraction
19265284
CBW
123 ignoreChar ignoreName undefChar undefName rewrite
124 versionTable alternateTable backwardsTable forwardsTable
125 rearrangeTable variableTable
00e00351 126 derivCode normCode rearrangeHash backwardsFlag
aa7758f7 127 suppress suppressHash
19265284 128 __useXS /; ### XS only
ed2081ad
CBW
129# The hash key 'ignored' was deleted at v 0.21.
130# The hash key 'isShift' was deleted at v 0.23.
131# The hash key 'combining' was deleted at v 0.24.
132# The hash key 'entries' was deleted at v 0.30.
133# The hash key 'L3_ignorable' was deleted at v 0.40.
91ae00cb
NC
134
135sub version {
136 my $self = shift;
137 return $self->{versionTable} || 'unknown';
138}
0116f5dc
JH
139
140my (%ChangeOK, %ChangeNG);
141@ChangeOK{ @ChangeOK } = ();
142@ChangeNG{ @ChangeNG } = ();
143
144sub change {
145 my $self = shift;
146 my %hash = @_;
147 my %old;
750da838
CBW
148 if (exists $hash{alternate}) {
149 if (exists $hash{variable}) {
150 delete $hash{alternate};
151 } else {
152 $hash{variable} = $hash{alternate};
153 }
91ae00cb 154 }
0116f5dc
JH
155 foreach my $k (keys %hash) {
156 if (exists $ChangeOK{$k}) {
157 $old{$k} = $self->{$k};
158 $self->{$k} = $hash{$k};
750da838 159 } elsif (exists $ChangeNG{$k}) {
0116f5dc
JH
160 croak "change of $k via change() is not allowed!";
161 }
162 # else => ignored
163 }
3756e7ca 164 $self->checkCollator();
0116f5dc
JH
165 return wantarray ? %old : $self;
166}
a7fbee98 167
9f1f04a1
RGS
168sub _checkLevel {
169 my $level = shift;
abd1ec54
NC
170 my $key = shift; # 'level' or 'backwards'
171 MinLevel <= $level or croak sprintf
172 "Illegal level %d (in value for key '%s') lower than %d.",
173 $level, $key, MinLevel;
174 $level <= MaxLevel or croak sprintf
175 "Unsupported level %d (in value for key '%s') higher than %d.",
176 $level, $key, MaxLevel;
9f1f04a1
RGS
177}
178
91ae00cb
NC
179my %DerivCode = (
180 8 => \&_derivCE_8,
181 9 => \&_derivCE_9,
182 11 => \&_derivCE_9, # 11 == 9
3756e7ca 183 14 => \&_derivCE_14,
74b94a79
CBW
184 16 => \&_derivCE_14, # 16 == 14
185 18 => \&_derivCE_18,
00e00351 186 20 => \&_derivCE_20,
b5d9a953 187 22 => \&_derivCE_22,
cba8842c 188 24 => \&_derivCE_24,
750da838 189 26 => \&_derivCE_24, # 26 == 24
2b488730 190 28 => \&_derivCE_24, # 28 == 24
c28567dd 191 30 => \&_derivCE_24, # 30 == 24
fcd4e2f8 192 32 => \&_derivCE_32,
91ae00cb
NC
193);
194
0116f5dc
JH
195sub checkCollator {
196 my $self = shift;
9f1f04a1 197 _checkLevel($self->{level}, "level");
a7fbee98 198
91ae00cb
NC
199 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
200 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 201
91ae00cb 202 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 203 $self->{alternateTable} || 'shifted';
91ae00cb
NC
204 $self->{variable} = $self->{alternate} = lc($self->{variable});
205 exists $VariableOK{ $self->{variable} }
3756e7ca 206 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 207
9f1f04a1
RGS
208 if (! defined $self->{backwards}) {
209 $self->{backwardsFlag} = 0;
750da838 210 } elsif (! ref $self->{backwards}) {
9f1f04a1
RGS
211 _checkLevel($self->{backwards}, "backwards");
212 $self->{backwardsFlag} = 1 << $self->{backwards};
750da838 213 } else {
9f1f04a1
RGS
214 my %level;
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
218 $level{$b} = 1;
219 }
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
222 }
223 }
0116f5dc 224
91ae00cb
NC
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
228
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
231
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
234 }
235
236 $self->{normCode} = undef;
a7fbee98
JH
237
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
91ae00cb 240 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 241
91ae00cb 242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 243
91ae00cb
NC
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
246 }
247 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
1d2654e1
JH
250 Unicode::Normalize::normalize($norm, shift);
251 };
06c8fc8f
RGS
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 254 }
a7fbee98 255 }
0116f5dc
JH
256 return;
257}
258
259sub new
260{
261 my $class = shift;
262 my $self = bless { @_ }, $class;
45394607 263
19265284
CBW
264### begin XS only ###
265 if (! exists $self->{table} && !defined $self->{rewrite} &&
f58b9ef1
CBW
266 !defined $self->{undefName} && !defined $self->{ignoreName} &&
267 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
268 $self->{__useXS} = \&_fetch_simple;
19265284
CBW
269 } else {
270 $self->{__useXS} = undef;
271 }
272### end XS only ###
f58b9ef1 273
aa7758f7
CBW
274 # keys of $self->{suppressHash} are $self->{suppress}.
275 if ($self->{suppress} && @{ $self->{suppress} }) {
276 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
277 } # before read_table()
278
a7fbee98 279 # If undef is passed explicitly, no file is read.
0116f5dc 280 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 281 $self->read_table() if defined $self->{table};
905aa9f0 282
a7fbee98 283 if ($self->{entry}) {
e7f779c8 284 while ($self->{entry} =~ /([^\n]+)/g) {
bd65daab 285 $self->parseEntry($1, TRUE);
e7f779c8 286 }
a7fbee98 287 }
905aa9f0 288
f8187d97 289 # only in new(), not in change()
9f1f04a1 290 $self->{level} ||= MaxLevel;
0116f5dc 291 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 292
abd1ec54 293 $self->{overrideHangul} = FALSE
0116f5dc 294 if ! exists $self->{overrideHangul};
abd1ec54 295 $self->{overrideCJK} = FALSE
0116f5dc 296 if ! exists $self->{overrideCJK};
06c8fc8f 297 $self->{normalization} = 'NFD'
0116f5dc 298 if ! exists $self->{normalization};
3756e7ca
RGS
299 $self->{rearrange} = $self->{rearrangeTable} ||
300 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 301 if ! exists $self->{rearrange};
0116f5dc
JH
302 $self->{backwards} = $self->{backwardsTable}
303 if ! exists $self->{backwards};
f8187d97
SH
304 exists $self->{long_contraction} or $self->{long_contraction}
305 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
a7fbee98 306
f8187d97 307 # checkCollator() will be called in change()
3756e7ca 308 $self->checkCollator();
a7fbee98
JH
309
310 return $self;
311}
905aa9f0 312
00e00351
CBW
313sub parseAtmark {
314 my $self = shift;
315 my $line = shift; # after s/^\s*\@//
316
317 if ($line =~ /^version\s*(\S*)/) {
318 $self->{versionTable} ||= $1;
319 }
320 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
321 $self->{variableTable} ||= $1;
322 }
323 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
324 $self->{alternateTable} ||= $1;
325 }
326 elsif ($line =~ /^backwards\s+(\S*)/) {
327 push @{ $self->{backwardsTable} }, $1;
328 }
ed2081ad 329 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
00e00351
CBW
330 push @{ $self->{forwardsTable} }, $1;
331 }
332 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
333 push @{ $self->{rearrangeTable} }, _getHexArray($1);
334 }
335}
336
905aa9f0 337sub read_table {
a7fbee98 338 my $self = shift;
a7fbee98 339
19265284 340### begin XS only ###
f58b9ef1
CBW
341 if ($self->{__useXS}) {
342 my @rest = _fetch_rest(); # complex matter need to parse
343 for my $line (@rest) {
344 next if $line =~ /^\s*#/;
345
346 if ($line =~ s/^\s*\@//) {
347 $self->parseAtmark($line);
348 } else {
349 $self->parseEntry($line);
350 }
351 }
352 return;
353 }
19265284 354### end XS only ###
f58b9ef1 355
e7f779c8
RGS
356 my($f, $fh);
357 foreach my $d (@INC) {
358 $f = File::Spec->catfile($d, @Path, $self->{table});
359 last if open($fh, $f);
360 $f = undef;
361 }
0d50d293
RGS
362 if (!defined $f) {
363 $f = File::Spec->catfile(@Path, $self->{table});
364 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
365 }
a7fbee98 366
6d24ed10
SP
367 while (my $line = <$fh>) {
368 next if $line =~ /^\s*#/;
abd1ec54 369
00e00351
CBW
370 if ($line =~ s/^\s*\@//) {
371 $self->parseAtmark($line);
372 } else {
373 $self->parseEntry($line);
abd1ec54 374 }
45394607 375 }
e7f779c8 376 close $fh;
45394607
JH
377}
378
905aa9f0 379
45394607
JH
380##
381## get $line, parse it, and write an entry in $self
382##
383sub parseEntry
384{
a7fbee98
JH
385 my $self = shift;
386 my $line = shift;
bd65daab 387 my $tailoring = shift;
4d36a948 388 my($name, $entry, @uv, @key);
a7fbee98 389
19265284
CBW
390 if (defined $self->{rewrite}) {
391 $line = $self->{rewrite}->($line);
392 }
393
a7fbee98
JH
394 return if $line !~ /^\s*[0-9A-Fa-f]/;
395
396 # removes comment and gets name
397 $name = $1
398 if $line =~ s/[#%]\s*(.*)//;
399 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
400
401 # gets element
402 my($e, $k) = split /;/, $line;
403 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
404 if ! $k;
405
4d36a948
TS
406 @uv = _getHexArray($e);
407 return if !@uv;
bd65daab 408 return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
aa7758f7 409 exists $self->{suppressHash}{$uv[0]};
4d36a948 410 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 411
4d36a948 412 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 413 my $ele = pack_U(@uv);
a7fbee98 414
ed2081ad 415 # regarded as if it were not stored in the table
4d36a948
TS
416 return
417 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 418
4d36a948
TS
419 # replaced as completely ignorable
420 $k = '[.0000.0000.0000.0000]'
421 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 422 }
0116f5dc 423
4d36a948
TS
424 # replaced as completely ignorable
425 $k = '[.0000.0000.0000.0000]'
426 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
427
4c843366 428 my $is_L3_ignorable = TRUE;
4d36a948 429
caffd4cf
TS
430 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
431 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
TS
432 my @wt = _getHexArray($arr);
433 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 434 $is_L3_ignorable = FALSE
3756e7ca
RGS
435 if $wt[0] || $wt[1] || $wt[2];
436 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
437 # is completely ignorable.
4c843366
JH
438 # For expansion, an entry $is_L3_ignorable
439 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 440 }
caffd4cf 441
e7f779c8 442 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 443
91ae00cb 444 if (@uv > 1) {
b5d9a953
CBW
445 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
446 $self->{maxlength}{$uv[0]} = @uv;
447 }
448 }
f8187d97
SH
449 while (@uv > 2) {
450 pop @uv;
451 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
452 $self->{contraction}{$fake_entry} = 1;
91ae00cb 453 }
45394607
JH
454}
455
9f1f04a1 456
45394607
JH
457sub viewSortKey
458{
a7fbee98 459 my $self = shift;
750da838
CBW
460 my $str = shift;
461 $self->visualizeSortKey($self->getSortKey($str));
9f1f04a1 462}
0116f5dc 463
d16e9e3d 464
750da838
CBW
465sub process
466{
467 my $self = shift;
468 my $str = shift;
469 my $prep = $self->{preprocess};
470 my $norm = $self->{normCode};
471
472 $str = &$prep($str) if ref $prep;
473 $str = &$norm($str) if ref $norm;
474 return $str;
475}
476
45394607 477##
91ae00cb 478## arrayref of JCPS = splitEnt(string to be collated)
750da838 479## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
45394607 480##
91ae00cb 481sub splitEnt
45394607 482{
a7fbee98 483 my $self = shift;
750da838
CBW
484 my $str = shift;
485 my $wLen = shift; # with Length
4d36a948 486
91ae00cb 487 my $map = $self->{mapping};
a7fbee98
JH
488 my $max = $self->{maxlength};
489 my $reH = $self->{rearrangeHash};
b5d9a953
CBW
490 my $vers = $self->{UCA_Version};
491 my $ver9 = $vers >= 9 && $vers <= 11;
f8187d97 492 my $long = $self->{long_contraction};
19265284 493 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 494
750da838 495 my @buf;
a7fbee98 496
4d36a948 497 # get array of Unicode code point of string.
9f1f04a1 498 my @src = unpack_U($str);
4d36a948
TS
499
500 # rearrangement:
501 # Character positions are not kept if rearranged,
502 # then neglected if $wLen is true.
503 if ($reH && ! $wLen) {
a7fbee98
JH
504 for (my $i = 0; $i < @src; $i++) {
505 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
506 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
507 $i++;
508 }
509 }
45394607 510 }
45394607 511
3756e7ca 512 # remove a code point marked as a completely ignorable.
abd1ec54 513 for (my $i = 0; $i < @src; $i++) {
60f577e0 514 if ($vers <= 20 && _isIllegal($src[$i])) {
b5d9a953
CBW
515 $src[$i] = undef;
516 } elsif ($ver9) {
6d3c12b5
CBW
517 $src[$i] = undef if $map->{ $src[$i] }
518 ? @{ $map->{ $src[$i] } } == 0
f443a335 519 : $uXS && _ignorable_simple($src[$i]); ### XS only
b5d9a953 520 }
0116f5dc
JH
521 }
522
a7fbee98 523 for (my $i = 0; $i < @src; $i++) {
91ae00cb 524 my $jcps = $src[$i];
3756e7ca
RGS
525
526 # skip removed code point
527 if (! defined $jcps) {
528 if ($wLen && @buf) {
529 $buf[-1][2] = $i + 1;
530 }
531 next;
532 }
533
abd1ec54 534 my $i_orig = $i;
4d36a948 535
3756e7ca
RGS
536 # find contraction
537 if ($max->{$jcps}) {
91ae00cb
NC
538 my $temp_jcps = $jcps;
539 my $jcpsLen = 1;
540 my $maxLen = $max->{$jcps};
4d36a948 541
91ae00cb 542 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 543 next if ! defined $src[$p];
91ae00cb
NC
544 $temp_jcps .= CODE_SEP . $src[$p];
545 $jcpsLen++;
546 if ($map->{$temp_jcps}) {
547 $jcps = $temp_jcps;
4d36a948
TS
548 $i = $p;
549 }
550 }
4d36a948 551
b5d9a953 552 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
06c8fc8f 553 # This process requires Unicode::Normalize.
91ae00cb 554 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
555 # (in spite of bool value of $CVgetCombinClass),
556 # since canonical ordering cannot be expected.
557 # Blocked combining character should not be contracted.
558
06c8fc8f 559 # $self->{normCode} is false in the case of "prenormalized".
b5d9a953
CBW
560 if ($self->{normalization}) {
561 my $cont = $self->{contraction};
06c8fc8f 562 my $preCC = 0;
b5d9a953
CBW
563 my $preCC_uc = 0;
564 my $jcps_uc = $jcps;
565 my(@out, @out_uc);
06c8fc8f
RGS
566
567 for (my $p = $i + 1; $p < @src; $p++) {
568 next if ! defined $src[$p];
b5d9a953 569 my $curCC = $CVgetCombinClass->($src[$p]);
06c8fc8f
RGS
570 last unless $curCC;
571 my $tail = CODE_SEP . $src[$p];
b5d9a953 572
f8187d97
SH
573 if ($preCC != $curCC && $map->{$jcps.$tail}) {
574 $jcps .= $tail;
575 push @out, $p;
576 } else {
577 $preCC = $curCC;
578 }
579
580 next if !$long;
581
b5d9a953
CBW
582 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
583 $cont->{$jcps_uc.$tail})) {
584 $jcps_uc .= $tail;
585 push @out_uc, $p;
586 } else {
587 $preCC_uc = $curCC;
588 }
4d36a948 589 }
b5d9a953 590
f8187d97 591 if (@out_uc && $map->{$jcps_uc}) {
b5d9a953
CBW
592 $jcps = $jcps_uc;
593 $src[$_] = undef for @out_uc;
594 } else {
595 $src[$_] = undef for @out;
596 }
a7fbee98 597 }
a7fbee98
JH
598 }
599
3756e7ca 600 # skip completely ignorable
6d3c12b5
CBW
601 if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
602 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
3756e7ca
RGS
603 if ($wLen && @buf) {
604 $buf[-1][2] = $i + 1;
a7fbee98 605 }
3756e7ca 606 next;
a7fbee98 607 }
4d36a948 608
91ae00cb 609 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 610 }
4d36a948 611 return \@buf;
d16e9e3d 612}
45394607 613
539ce3d8
CBW
614##
615## VCE = _pack_override(input, codepoint, derivCode)
616##
617sub _pack_override ($$$) {
618 my $r = shift;
619 my $u = shift;
620 my $der = shift;
621
622 if (ref $r) {
623 return pack(VCE_TEMPLATE, NON_VAR, @$r);
624 } elsif (defined $r) {
625 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
626 } else {
d8e4b4ea 627 $u = 0xFFFD if 0x10FFFF < $u;
539ce3d8
CBW
628 return $der->($u);
629 }
630}
d16e9e3d
JH
631
632##
abd1ec54 633## list of VCE = getWt(JCPS)
d16e9e3d
JH
634##
635sub getWt
636{
a7fbee98 637 my $self = shift;
91ae00cb
NC
638 my $u = shift;
639 my $map = $self->{mapping};
0116f5dc 640 my $der = $self->{derivCode};
60f577e0 641 my $out = $self->{overrideOut};
19265284 642 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 643
91ae00cb 644 return if !defined $u;
6d3c12b5
CBW
645 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
646 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
60f577e0
SH
647 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
648
649 my @ce;
650 if ($map->{$u}) {
651 @ce = @{ $map->{$u} }; # $u may be a contraction
19265284 652### begin XS only ###
60f577e0
SH
653 } elsif ($uXS && _exists_simple($u)) {
654 @ce = _fetch_simple($u);
19265284 655### end XS only ###
60f577e0 656 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 657 my $hang = $self->{overrideHangul};
1d2654e1 658 if ($hang) {
60f577e0 659 @ce = map _pack_override($_, $u, $der), $hang->($u);
539ce3d8 660 } elsif (!defined $hang) {
60f577e0 661 @ce = $der->($u);
539ce3d8 662 } else {
1d2654e1
JH
663 my $max = $self->{maxlength};
664 my @decH = _decompHangul($u);
665
666 if (@decH == 2) {
667 my $contract = join(CODE_SEP, @decH);
91ae00cb 668 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
669 } else { # must be <@decH == 3>
670 if ($max->{$decH[0]}) {
671 my $contract = join(CODE_SEP, @decH);
91ae00cb 672 if ($map->{$contract}) {
1d2654e1
JH
673 @decH = ($contract);
674 } else {
675 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 676 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
677 }
678 # even if V's ignorable, LT contraction is not supported.
f58b9ef1 679 # If such a situation were required, NFD should be used.
1d2654e1
JH
680 }
681 if (@decH == 3 && $max->{$decH[1]}) {
682 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 683 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
684 }
685 }
686
60f577e0 687 @ce = map({
f58b9ef1 688 $map->{$_} ? @{ $map->{$_} } :
19265284 689 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
f58b9ef1 690 $der->($_);
1d2654e1
JH
691 } @decH);
692 }
60f577e0
SH
693 } elsif ($out && 0x10FFFF < $u) {
694 @ce = map _pack_override($_, $u, $der), $out->($u);
539ce3d8 695 } else {
584e761d
CBW
696 my $cjk = $self->{overrideCJK};
697 my $vers = $self->{UCA_Version};
698 if ($cjk && _isUIdeo($u, $vers)) {
60f577e0
SH
699 @ce = map _pack_override($_, $u, $der), $cjk->($u);
700 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
701 @ce = _uideoCE_8($u);
702 } else {
703 @ce = $der->($u);
584e761d 704 }
a7fbee98 705 }
60f577e0 706 return map $self->varCE($_), @ce;
d16e9e3d
JH
707}
708
d16e9e3d
JH
709
710##
711## string sortkey = getSortKey(string arg)
712##
713sub getSortKey
714{
a7fbee98 715 my $self = shift;
750da838
CBW
716 my $orig = shift;
717 my $str = $self->process($orig);
718 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
00e00351 719 my $vers = $self->{UCA_Version};
aa7758f7 720 my $term = $self->{hangul_terminator};
750da838
CBW
721 my $lev = $self->{level};
722 my $iden = $self->{identical};
91ae00cb 723
abd1ec54 724 my @buf; # weight arrays
aa7758f7 725 if ($term) {
91ae00cb 726 my $preHST = '';
19265284 727 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
91ae00cb
NC
728 foreach my $jcps (@$rEnt) {
729 # weird things like VL, TL-contraction are not considered!
aa7758f7 730 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
91ae00cb
NC
731 if ($preHST && !$curHST || # hangul before non-hangul
732 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
733 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
734 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
aa7758f7 735 push @buf, $termCE;
91ae00cb
NC
736 }
737 $preHST = $curHST;
abd1ec54 738 push @buf, $self->getWt($jcps);
91ae00cb 739 }
aa7758f7
CBW
740 push @buf, $termCE if $preHST; # end at hangul
741 } else {
91ae00cb 742 foreach my $jcps (@$rEnt) {
abd1ec54 743 push @buf, $self->getWt($jcps);
91ae00cb
NC
744 }
745 }
746
750da838
CBW
747 my $rkey = $self->mk_SortKey(\@buf); ### XS only
748
749 if ($iden || $vers >= 26 && $lev == MaxLevel) {
750 $rkey .= LEVEL_SEP;
751 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
752 }
753 return $rkey;
45394607
JH
754}
755
756
757##
d16e9e3d 758## int compare = cmp(string a, string b)
45394607 759##
5398038e
TS
760sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
761sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
762sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
763sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
764sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
765sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
766sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
767
768##
d16e9e3d 769## list[strings] sorted = sort(list[strings] arg)
45394607 770##
a7fbee98
JH
771sub sort {
772 my $obj = shift;
773 return
774 map { $_->[1] }
775 sort{ $a->[0] cmp $b->[0] }
776 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
777}
778
0116f5dc 779
4d36a948
TS
780##
781## bool _nonIgnorAtLevel(arrayref weights, int level)
782##
783sub _nonIgnorAtLevel($$)
784{
785 my $wt = shift;
786 return if ! defined $wt;
787 my $lv = shift;
9f1f04a1 788 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
TS
789}
790
791##
792## bool _eqArray(
793## arrayref of arrayref[weights] source,
794## arrayref of arrayref[weights] substr,
795## int level)
796## * comparison of graphemes vs graphemes.
797## @$source >= @$substr must be true (check it before call this);
798##
799sub _eqArray($$$)
800{
801 my $source = shift;
802 my $substr = shift;
803 my $lev = shift;
804
805 for my $g (0..@$substr-1){
f58b9ef1 806 # Do the $g'th graphemes have the same number of AV weights?
4d36a948
TS
807 return if @{ $source->[$g] } != @{ $substr->[$g] };
808
809 for my $w (0..@{ $substr->[$g] }-1) {
810 for my $v (0..$lev-1) {
811 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
812 }
813 }
814 }
815 return 1;
816}
817
818##
819## (int position, int length)
17609435 820## int position = index(string, substring, position, [undoc'ed global])
4d36a948 821##
17609435 822## With "global" (only for the list context),
4d36a948
TS
823## returns list of arrayref[position, length].
824##
825sub index
826{
91ae00cb 827 my $self = shift;
750da838
CBW
828 $self->{preprocess} and
829 croak "Don't use Preprocess with index(), match(), etc.";
830 $self->{normCode} and
831 croak "Don't use Normalization with index(), match(), etc.";
832
91ae00cb
NC
833 my $str = shift;
834 my $len = length($str);
750da838
CBW
835 my $sub = shift;
836 my $subE = $self->splitEnt($sub);
91ae00cb
NC
837 my $pos = @_ ? shift : 0;
838 $pos = 0 if $pos < 0;
17609435 839 my $glob = shift;
91ae00cb
NC
840
841 my $lev = $self->{level};
3756e7ca
RGS
842 my $v2i = $self->{UCA_Version} >= 9 &&
843 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
844
845 if (! @$subE) {
4d36a948 846 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
17609435 847 return $glob
4d36a948
TS
848 ? map([$_, 0], $temp..$len)
849 : wantarray ? ($temp,0) : $temp;
850 }
abd1ec54
NC
851 $len < $pos
852 and return wantarray ? () : NOMATCHPOS;
91ae00cb 853 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
854 @$strE
855 or return wantarray ? () : NOMATCHPOS;
856
4d36a948
TS
857 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
858
abd1ec54
NC
859 my $last_is_variable;
860 for my $vwt (map $self->getWt($_), @$subE) {
861 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
862 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 863
3756e7ca
RGS
864 # "Ignorable (L1, L2) after Variable" since track. v. 9
865 if ($v2i) {
abd1ec54
NC
866 if ($var) {
867 $last_is_variable = TRUE;
868 }
869 elsif (!$wt[0]) { # ignorable
4d36a948 870 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
871 }
872 else {
873 $last_is_variable = FALSE;
4d36a948
TS
874 }
875 }
876
abd1ec54
NC
877 if (@subWt && !$var && !$wt[0]) {
878 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1393fe00 879 } elsif ($to_be_pushed) {
abd1ec54 880 push @subWt, [ \@wt ];
4d36a948 881 }
1393fe00 882 # else ===> skipped
4d36a948
TS
883 }
884
885 my $count = 0;
91ae00cb 886 my $end = @$strE - 1;
4d36a948 887
abd1ec54 888 $last_is_variable = FALSE; # reuse
4d36a948
TS
889 for (my $i = 0; $i <= $end; ) { # no $i++
890 my $found_base = 0;
891
892 # fetch a grapheme
893 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
894 for my $vwt ($self->getWt($strE->[$i][0])) {
895 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
896 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 897
3756e7ca
RGS
898 # "Ignorable (L1, L2) after Variable" since track. v. 9
899 if ($v2i) {
abd1ec54
NC
900 if ($var) {
901 $last_is_variable = TRUE;
902 }
903 elsif (!$wt[0]) { # ignorable
4d36a948 904 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
905 }
906 else {
907 $last_is_variable = FALSE;
4d36a948
TS
908 }
909 }
910
abd1ec54
NC
911 if (@strWt && !$var && !$wt[0]) {
912 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 913 $finPos[-1] = $strE->[$i][2];
4d36a948 914 } elsif ($to_be_pushed) {
abd1ec54 915 push @strWt, [ \@wt ];
91ae00cb 916 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 917 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 918 push @finPos, $strE->[$i][2];
4d36a948
TS
919 $found_base++;
920 }
921 # else ===> no-op
922 }
923 $i++;
924 }
925
926 # try to match
927 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
928 if ($iniPos[0] != NOMATCHPOS &&
929 $finPos[$#subWt] != NOMATCHPOS &&
930 _eqArray(\@strWt, \@subWt, $lev)) {
931 my $temp = $iniPos[0] + $pos;
932
17609435 933 if ($glob) {
4d36a948
TS
934 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
935 splice @strWt, 0, $#subWt;
936 splice @iniPos, 0, $#subWt;
937 splice @finPos, 0, $#subWt;
938 }
939 else {
940 return wantarray
941 ? ($temp, $finPos[$#subWt] - $iniPos[0])
942 : $temp;
943 }
944 }
945 shift @strWt;
946 shift @iniPos;
947 shift @finPos;
948 }
949 }
950
17609435 951 return $glob
4d36a948
TS
952 ? @g_ret
953 : wantarray ? () : NOMATCHPOS;
954}
955
956##
957## scalarref to matching part = match(string, substring)
958##
959sub match
960{
961 my $self = shift;
962 if (my($pos,$len) = $self->index($_[0], $_[1])) {
963 my $temp = substr($_[0], $pos, $len);
964 return wantarray ? $temp : \$temp;
965 # An lvalue ref \substr should be avoided,
966 # since its value is affected by modification of its referent.
967 }
968 else {
969 return;
970 }
971}
972
973##
974## arrayref matching parts = gmatch(string, substring)
975##
976sub gmatch
977{
978 my $self = shift;
979 my $str = shift;
980 my $sub = shift;
981 return map substr($str, $_->[0], $_->[1]),
982 $self->index($str, $sub, 0, 'g');
983}
984
985##
986## bool subst'ed = subst(string, substring, replace)
987##
988sub subst
989{
990 my $self = shift;
991 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
992
993 if (my($pos,$len) = $self->index($_[0], $_[1])) {
994 if ($code) {
995 my $mat = substr($_[0], $pos, $len);
996 substr($_[0], $pos, $len, $code->($mat));
997 } else {
998 substr($_[0], $pos, $len, $_[2]);
999 }
1000 return TRUE;
1001 }
1002 else {
1003 return FALSE;
1004 }
1005}
1006
1007##
1008## int count = gsubst(string, substring, replace)
1009##
1010sub gsubst
1011{
1012 my $self = shift;
1013 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1014 my $cnt = 0;
1015
1016 # Replacement is carried out from the end, then use reverse.
1017 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1018 if ($code) {
1019 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1020 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1021 } else {
1022 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1023 }
1024 $cnt++;
1025 }
1026 return $cnt;
1027}
1028
45394607
JH
10291;
1030__END__
1031
1032=head1 NAME
1033
a7fbee98 1034Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1035
1036=head1 SYNOPSIS
1037
1038 use Unicode::Collate;
1039
1040 #construct
5398038e 1041 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1042
1043 #sort
5398038e 1044 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1045
1046 #compare
a7fbee98 1047 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1048
539ce3d8
CBW
1049B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1050according to Perl's Unicode support. See L<perlunicode>,
1051L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1052Otherwise you can use C<preprocess> or should decode them before.
91ae00cb 1053
45394607
JH
1054=head1 DESCRIPTION
1055
3756e7ca
RGS
1056This module is an implementation of Unicode Technical Standard #10
1057(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1058
45394607
JH
1059=head2 Constructor and Tailoring
1060
539ce3d8
CBW
1061The C<new> method returns a collator object. If new() is called
1062with no parameters, the collator should do the default collation.
d16e9e3d 1063
5398038e 1064 $Collator = Unicode::Collate->new(
0116f5dc 1065 UCA_Version => $UCA_Version,
aa7758f7 1066 alternate => $alternate, # alias for 'variable'
45394607
JH
1067 backwards => $levelNumber, # or \@levelNumbers
1068 entry => $element,
91ae00cb 1069 hangul_terminator => $term_primary_weight,
6d3c12b5 1070 highestFFFF => $bool,
750da838 1071 identical => $bool,
45394607
JH
1072 ignoreName => qr/$ignoreName/,
1073 ignoreChar => qr/$ignoreChar/,
19265284 1074 ignore_level2 => $bool,
45394607
JH
1075 katakana_before_hiragana => $bool,
1076 level => $collationLevel,
f8187d97 1077 long_contraction => $bool,
6d3c12b5 1078 minimalFFFE => $bool,
91ae00cb 1079 normalization => $normalization_form,
45394607
JH
1080 overrideCJK => \&overrideCJK,
1081 overrideHangul => \&overrideHangul,
1082 preprocess => \&preprocess,
1083 rearrange => \@charList,
19265284 1084 rewrite => \&rewrite,
aa7758f7 1085 suppress => \@charList,
45394607
JH
1086 table => $filename,
1087 undefName => qr/$undefName/,
1088 undefChar => qr/$undefChar/,
1089 upper_before_lower => $bool,
91ae00cb 1090 variable => $variable,
45394607 1091 );
45394607
JH
1092
1093=over 4
1094
0116f5dc
JH
1095=item UCA_Version
1096
7b98b857
CBW
1097If the revision (previously "tracking version") number of UCA is given,
1098behavior of that revision is emulated on collating.
0116f5dc 1099If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1100
fcd4e2f8 1101The following revisions are supported. The default is 32.
3756e7ca 1102
0d50d293 1103 UCA Unicode Standard DUCET (@version)
b5d9a953 1104 -------------------------------------------------------
0d50d293
RGS
1105 8 3.1 3.0.1 (3.0.1d9)
1106 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1107 11 4.0 4.0.0 (4.0.0)
1108 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1109 16 5.0 5.0.0 (5.0.0)
1110 18 5.1.0 5.1.0 (5.1.0)
00e00351 1111 20 5.2.0 5.2.0 (5.2.0)
b5d9a953 1112 22 6.0.0 6.0.0 (6.0.0)
cba8842c 1113 24 6.1.0 6.1.0 (6.1.0)
750da838 1114 26 6.2.0 6.2.0 (6.2.0)
2b488730 1115 28 6.3.0 6.3.0 (6.3.0)
c28567dd 1116 30 7.0.0 7.0.0 (7.0.0)
fcd4e2f8 1117 32 8.0.0 8.0.0 (8.0.0)
3756e7ca 1118
f8187d97
SH
1119* See below C<long_contraction> with C<UCA_Version> 22 and 24.
1120
f58b9ef1 1121* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
b5d9a953
CBW
1122since C<UCA_Version> 22.
1123
60f577e0
SH
1124* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1125and can be overridden since C<UCA_Version> 22.
1126
b5d9a953
CBW
1127* Fully ignorable characters were ignored, and would not interrupt
1128contractions with C<UCA_Version> 9 and 11.
1129
1130* Treatment of ignorables after variables and some behaviors
1131were changed at C<UCA_Version> 9.
1132
1133* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1134depend on C<UCA_Version>.
1135
1136* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1137C<hangul_terminator>.
1138
3756e7ca 1139=item alternate
0116f5dc 1140
3756e7ca 1141-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1142
3756e7ca
RGS
1143For backward compatibility, C<alternate> (old name) can be used
1144as an alias for C<variable>.
0116f5dc 1145
45394607
JH
1146=item backwards
1147
750da838 1148-- see 3.4 Backward Accents, UTS #10.
45394607
JH
1149
1150 backwards => $levelNumber or \@levelNumbers
1151
1152Weights in reverse order; ex. level 2 (diacritic ordering) in French.
68adb2b0
CBW
1153If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1154forwards at all the levels.
45394607
JH
1155
1156=item entry
1157
f8187d97 1158-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
45394607 1159
91ae00cb
NC
1160If the same character (or a sequence of characters) exists
1161in the collation element table through C<table>,
f58b9ef1 1162mapping to collation elements is overridden.
91ae00cb 1163If it does not exist, the mapping is defined additionally.
45394607 1164
abd1ec54
NC
1165 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11660063 0068 ; [.0E6A.0020.0002.0063] # ch
11670043 0068 ; [.0E6A.0020.0007.0043] # Ch
11680043 0048 ; [.0E6A.0020.0008.0043] # CH
1169006C 006C ; [.0F4C.0020.0002.006C] # ll
1170004C 006C ; [.0F4C.0020.0007.004C] # Ll
1171004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
117200F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1173006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
117400D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1175004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1176ENTRY
1177
1178 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
117900E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
118000C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1181ENTRY
45394607 1182
4d36a948 1183B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1184B<must> be a Unicode code point (defined as hexadecimal),
1185but not a native code point.
4d36a948
TS
1186So C<0063> must always denote C<U+0063>,
1187but not a character of C<"\x63">.
1188
abd1ec54
NC
1189Weighting may vary depending on collation element table.
1190So ensure the weights defined in C<entry> will be consistent with
1191those in the collation element table loaded via C<table>.
1192
1193In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1194and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1195(as a value between C<0E60> and C<0E6D>)
1196makes ordering as C<C E<lt> CH E<lt> D>.
1197Exactly speaking DUCET already has some characters between C<C> and C<D>:
1198C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1199C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1200and C<c-curl> (C<U+0255>) with C<0E69>.
1201Then primary weight C<0E6A> for C<CH> makes C<CH>
1202ordered between C<c-curl> and C<D>.
1203
91ae00cb
NC
1204=item hangul_terminator
1205
3756e7ca 1206-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1207
1208If a true value is given (non-zero but should be positive),
1209it will be added as a terminator primary weight to the end of
1210every standard Hangul syllable. Secondary and any higher weights
1211for terminator are set to zero.
1212If the value is false or C<hangul_terminator> key does not exist,
1213insertion of terminator weights will not be performed.
1214
1215Boundaries of Hangul syllables are determined
1216according to conjoining Jamo behavior in F<the Unicode Standard>
1217and F<HangulSyllableType.txt>.
1218
1219B<Implementation Note:>
1220(1) For expansion mapping (Unicode character mapped
1221to a sequence of collation elements), a terminator will not be added
1222between collation elements, even if Hangul syllable boundary exists there.
1223Addition of terminator is restricted to the next position
1224to the last collation element.
1225
1226(2) Non-conjoining Hangul letters
1227(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1228automatically terminated with a terminator primary weight.
1229These characters may need terminator included in a collation element
1230table beforehand.
1231
6d3c12b5
CBW
1232=item highestFFFF
1233
1234-- see 5.14 Collation Elements, UTS #35.
1235
1236If the parameter is made true, C<U+FFFF> has a highest primary weight.
1237When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1238C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1239begins with C<"abc">, or another primary equivalent.
1240C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1241such as C<"abc\x{FFFF}xyz">.
1242
1243C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
ed2081ad 1244almost, but the latter has a problem that you should know which letter is
6d3c12b5 1245next to C<c>. For a certain language where C<ch> as the next letter,
ed2081ad 1246C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
6d3c12b5 1247
60f577e0
SH
1248Note:
1249This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
f443a335 1250Any other character than C<U+FFFF> can be tailored by C<entry>.
6d3c12b5 1251
750da838
CBW
1252=item identical
1253
1254-- see A.3 Deterministic Comparison, UTS #10.
1255
1256By default, strings whose weights are equal should be equal,
1257even though their code points are not equal.
5c77070f 1258Completely ignorable characters are ignored.
750da838
CBW
1259
1260If the parameter is made true, a final, tie-breaking level is used.
5c77070f
CBW
1261If no difference of weights is found after the comparison through
1262all the level specified by C<level>, the comparison with code points
1263will be performed.
ed2081ad 1264For the tie-breaking comparison, the sort key has code points
5c77070f
CBW
1265of the original string appended.
1266Completely ignorable characters are not ignored.
750da838
CBW
1267
1268If C<preprocess> and/or C<normalization> is applied, the code points
1269of the string after them (in NFD by default) are used.
1270
45394607
JH
1271=item ignoreChar
1272
3756e7ca
RGS
1273=item ignoreName
1274
f8187d97 1275-- see 3.6 Variable Weighting, UTS #10.
45394607 1276
caffd4cf
TS
1277Makes the entry in the table completely ignorable;
1278i.e. as if the weights were zero at all level.
45394607 1279
3756e7ca
RGS
1280Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1281will be ignored. Through C<ignoreName>, any character whose name
1282(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1283will be ignored.
1284
a7fbee98 1285E.g. when 'a' and 'e' are ignorable,
45394607
JH
1286'element' is equal to 'lament' (or 'lmnt').
1287
19265284
CBW
1288=item ignore_level2
1289
1290-- see 5.1 Parametric Tailoring, UTS #10.
1291
1292By default, case-sensitive comparison (that is level 3 difference)
1293won't ignore accents (that is level 2 difference).
1294
1295If the parameter is made true, accents (and other primary ignorable
1296characters) are ignored, even though cases are taken into account.
1297
1298B<NOTE>: C<level> should be 3 or greater.
1299
3756e7ca
RGS
1300=item katakana_before_hiragana
1301
750da838 1302-- see 7.2 Tertiary Weight Table, UTS #10.
3756e7ca
RGS
1303
1304By default, hiragana is before katakana.
1305If the parameter is made true, this is reversed.
1306
1307B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1308distinctions must occur in level 3, and their weights at level 3 must be
1309same as those mentioned in 7.3.1, UTS #10.
1310If you define your collation elements which violate this requirement,
1311this parameter does not work validly.
1312
45394607
JH
1313=item level
1314
3756e7ca 1315-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1316
1317Set the maximum level.
1318Any higher levels than the specified one are ignored.
1319
1320 Level 1: alphabetic ordering
1321 Level 2: diacritic ordering
1322 Level 3: case ordering
91ae00cb 1323 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1324
1325 ex.level => 2,
1326
a7fbee98
JH
1327If omitted, the maximum is the 4th.
1328
750da838
CBW
1329B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1330But this module only uses weights within 0xFFFF.
1331When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1332and 'shift-trimmed'), the level 4 may be unreliable.
1333
1334See also C<identical>.
1335
f8187d97
SH
1336=item long_contraction
1337
1338-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1339
1340If the parameter is made true, for a contraction with three or more
1341characters (here nicknamed "long contraction"), initial substrings
1342will be handled.
1343For example, a contraction ABC, where A is a starter, and B and C
1344are non-starters (character with non-zero combining character class),
1345will be detected even if there is not AB as a contraction.
1346
1347B<Default:> Usually false.
1348If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1349is not specified in C<new()>, a true value is set implicitly.
1350This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1351
1352C<change()> handles C<long_contraction> explicitly only.
1353If C<long_contraction> is not specified in C<change()>, even though
1354C<UCA_Version> is changed, C<long_contraction> will not be changed.
1355
1356B<Limitation:> Scanning non-starters is one-way (no back tracking).
1357If AB is found but not ABC is not found, other long contraction where
1358the first character is A and the second is not B may not be found.
1359
1360Under C<(normalization =E<gt> undef)>, detection step of discontiguous
c28567dd 1361contractions will be skipped.
f8187d97
SH
1362
1363B<Note:> The following contractions in DUCET are not considered
1364in steps S2.1.1 to S2.1.3, where they are discontiguous.
1365
1366 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1367 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1368
1369For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1370(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1371In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1372instead of C<0FB2 0F71 0F80>.
1373Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1374contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1375
6d3c12b5
CBW
1376=item minimalFFFE
1377
1378-- see 5.14 Collation Elements, UTS #35.
1379
1380If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1381The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1382first compares C<$a1> and C<$b1> at level 1, and
1383then C<$a2> and C<$b2> at level 1, as followed.
1384
1385 "ab\x{FFFE}a"
1386 "Ab\x{FFFE}a"
1387 "ab\x{FFFE}c"
1388 "Ab\x{FFFE}c"
1389 "ab\x{FFFE}xyz"
1390 "abc\x{FFFE}def"
1391 "abc\x{FFFE}xYz"
1392 "aBc\x{FFFE}xyz"
1393 "abcX\x{FFFE}def"
1394 "abcx\x{FFFE}xyz"
1395 "b\x{FFFE}aaa"
1396 "bbb\x{FFFE}a"
1397
60f577e0
SH
1398Note:
1399This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
f443a335 1400Any other character than C<U+FFFE> can be tailored by C<entry>.
6d3c12b5 1401
45394607
JH
1402=item normalization
1403
3756e7ca 1404-- see 4.1 Normalize, UTS #10.
45394607 1405
905aa9f0 1406If specified, strings are normalized before preparation of sort keys
45394607
JH
1407(the normalization is executed after preprocess).
1408
1d2654e1
JH
1409A form name C<Unicode::Normalize::normalize()> accepts will be applied
1410as C<$normalization_form>.
06c8fc8f 1411Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1412See C<Unicode::Normalize::normalize()> for detail.
1413If omitted, C<'NFD'> is used.
45394607 1414
91ae00cb 1415C<normalization> is performed after C<preprocess> (if defined).
45394607 1416
06c8fc8f
RGS
1417Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1418though they are not concerned with C<Unicode::Normalize::normalize()>.
1419
1420If C<undef> (not a string C<"undef">) is passed explicitly
1421as the value for this key,
45394607 1422any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1423if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1424only contiguous contractions are resolved;
1425e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1426C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1427In this point,
1428C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1429B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1430
1431In the case of C<(normalization =E<gt> "prenormalized")>,
1432any normalization is not performed, but
b5d9a953 1433discontiguous contractions with combining characters are performed.
06c8fc8f
RGS
1434Therefore
1435C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1436B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1437If source strings are finely prenormalized,
1438C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1439
1440Except C<(normalization =E<gt> undef)>,
1441B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1442
1443=item overrideCJK
1444
4d36a948 1445-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1446
584e761d 1447By default, CJK unified ideographs are ordered in Unicode codepoint
ed2081ad 1448order, but those in the CJK Unified Ideographs block are less than
584e761d 1449those in the CJK Unified Ideographs Extension A etc.
00e00351 1450
b5d9a953 1451 In the CJK Unified Ideographs block:
cba8842c
A
1452 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1453 U+4E00..U+9FBB if UCA_Version is 14 or 16.
b5d9a953 1454 U+4E00..U+9FC3 if UCA_Version is 18.
cba8842c 1455 U+4E00..U+9FCB if UCA_Version is 20 or 22.
fcd4e2f8
SH
1456 U+4E00..U+9FCC if UCA_Version is 24 to 30.
1457 U+4E00..U+9FD5 if UCA_Version is 32.
00e00351 1458
b5d9a953
CBW
1459 In the CJK Unified Ideographs Extension blocks:
1460 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
2b488730
CBW
1461 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1462 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
fcd4e2f8 1463 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32.
91ae00cb 1464
584e761d 1465Through C<overrideCJK>, ordering of CJK unified ideographs (including
f58b9ef1 1466extensions) can be overridden.
45394607 1467
584e761d 1468ex. CJK unified ideographs in the JIS code point order.
45394607
JH
1469
1470 overrideCJK => sub {
a7fbee98
JH
1471 my $u = shift; # get a Unicode codepoint
1472 my $b = pack('n', $u); # to UTF-16BE
1473 my $s = your_unicode_to_sjis_converter($b); # convert
1474 my $n = unpack('n', $s); # convert sjis to short
1475 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1476 },
1477
539ce3d8
CBW
1478The return value may be an arrayref of 1st to 4th weights as shown
1479above. The return value may be an integer as the primary weight
1480as shown below. If C<undef> is returned, the default derived
1481collation element will be used.
1482
1483 overrideCJK => sub {
1484 my $u = shift; # get a Unicode codepoint
1485 my $b = pack('n', $u); # to UTF-16BE
1486 my $s = your_unicode_to_sjis_converter($b); # convert
1487 my $n = unpack('n', $s); # convert sjis to short
1488 return $n; # return the primary weight
1489 },
1490
1491The return value may be a list containing zero or more of
1492an arrayref, an integer, or C<undef>.
1493
584e761d 1494ex. ignores all CJK unified ideographs.
a7fbee98
JH
1495
1496 overrideCJK => sub {()}, # CODEREF returning empty list
1497
1498 # where ->eq("Pe\x{4E00}rl", "Perl") is true
584e761d 1499 # as U+4E00 is a CJK unified ideograph and to be ignorable.
a7fbee98 1500
60f577e0
SH
1501If a false value (including C<undef>) is passed, C<overrideCJK>
1502has no effect.
1503C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1504
584e761d
CBW
1505But assignment of weight for CJK unified ideographs
1506in C<table> or C<entry> is still valid.
60f577e0
SH
1507If C<undef> is passed explicitly as the value for this key,
1508weights for CJK unified ideographs are treated as undefined.
1509However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1510has no special meaning.
584e761d
CBW
1511
1512B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1513C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1514C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
f58b9ef1 1515ideographs. But they can't be overridden via C<overrideCJK> when you use
584e761d
CBW
1516DUCET, as the table includes weights for them. C<table> or C<entry> has
1517priority over C<overrideCJK>.
a7fbee98
JH
1518
1519=item overrideHangul
1520
4d36a948 1521-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1522
584e761d 1523By default, Hangul syllables are decomposed into Hangul Jamo,
abd1ec54 1524even if C<(normalization =E<gt> undef)>.
f58b9ef1 1525But the mapping of Hangul syllables may be overridden.
a7fbee98 1526
3756e7ca 1527This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1528
584e761d
CBW
1529If you want to override the mapping of Hangul syllables,
1530NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1531Hangul syllables before overriding. FCD may decompose Hangul syllables
1532as the case may be.
45394607 1533
60f577e0
SH
1534If a false value (but not C<undef>) is passed, C<overrideHangul>
1535has no effect.
1536C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1537
a7fbee98 1538If C<undef> is passed explicitly as the value for this key,
584e761d 1539weight for Hangul syllables is treated as undefined
a7fbee98 1540without decomposition into Hangul Jamo.
584e761d
CBW
1541But definition of weight for Hangul syllables
1542in C<table> or C<entry> is still valid.
a7fbee98 1543
60f577e0
SH
1544=item overrideOut
1545
1546-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1547
1548Perl seems to allow out-of-range values (greater than 0x10FFFF).
1549By default, out-of-range values are replaced with C<U+FFFD>
1550(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1551or ignored when C<UCA_Version> E<lt>= 20.
1552
1553When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1554can be overridden. Though C<table> or C<entry> are available for them,
1555out-of-range values are too many.
1556
1557C<overrideOut> can perform it algorithmically.
1558This parameter works like C<overrideCJK>, so see there for examples.
1559
1560ex. ignores all out-of-range values.
1561
1562 overrideOut => sub {()}, # CODEREF returning empty list
1563
1564If a false value (including C<undef>) is passed, C<overrideOut>
1565has no effect.
1566C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1567
d8e4b4ea
CBW
1568B<NOTE ABOUT U+FFFD:>
1569
60f577e0
SH
1570UCA recommends that out-of-range values should not be ignored for security
1571reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1572However, C<U+FFFD> is wrongly mapped to a variable collation element
1573in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1574ignored when C<variable> isn't C<Non-ignorable>.
1575
d8e4b4ea
CBW
1576The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1577see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1578(7.1.4 Trailing Weights). Such a correction is reproduced by this.
60f577e0
SH
1579
1580 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1581
d8e4b4ea 1582This workaround is unnecessary since Unicode 6.3.0.
60f577e0 1583
45394607
JH
1584=item preprocess
1585
750da838 1586-- see 5.4 Preprocessing, UTS #10.
45394607 1587
19265284 1588If specified, the coderef is used to preprocess each string
45394607
JH
1589before the formation of sort keys.
1590
a7fbee98 1591ex. dropping English articles, such as "a" or "the".
45394607
JH
1592Then, "the pen" is before "a pencil".
1593
1594 preprocess => sub {
1595 my $str = shift;
a7fbee98 1596 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1597 return $str;
45394607
JH
1598 },
1599
91ae00cb 1600C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1601
539ce3d8
CBW
1602ex. decoding strings in a legacy encoding such as shift-jis:
1603
1604 $sjis_collator = Unicode::Collate->new(
1605 preprocess => \&your_shiftjis_to_unicode_decoder,
1606 );
1607 @result = $sjis_collator->sort(@shiftjis_strings);
1608
1609B<Note:> Strings returned from the coderef will be interpreted
1610according to Perl's Unicode support. See L<perlunicode>,
1611L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1612
45394607
JH
1613=item rearrange
1614
750da838 1615-- see 3.5 Rearrangement, UTS #10.
45394607
JH
1616
1617Characters that are not coded in logical order and to be rearranged.
ed2081ad 1618If C<UCA_Version> is equal to or less than 11, default is:
45394607
JH
1619
1620 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1621
3756e7ca
RGS
1622If you want to disallow any rearrangement, pass C<undef> or C<[]>
1623(a reference to empty list) as the value for this key.
1624
74b94a79
CBW
1625If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1626(i.e. no rearrangement).
a7fbee98 1627
0116f5dc
JH
1628B<According to the version 9 of UCA, this parameter shall not be used;
1629but it is not warned at present.>
1630
19265284
CBW
1631=item rewrite
1632
1633If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1634The coderef will get each line, and then should return a rewritten line
1635according to the UCA file format.
1636If the coderef returns an empty line, the line will be skipped.
1637
1638e.g. any primary ignorable characters into tertiary ignorable:
1639
1640 rewrite => sub {
1641 my $line = shift;
1642 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1643 return $line;
1644 },
1645
1646This example shows rewriting weights. C<rewrite> is allowed to
1647affect code points, weights, and the name.
1648
1649B<NOTE>: C<table> is available to use another table file;
1650preparing a modified table once would be more efficient than
1651rewriting lines on reading an unmodified table every time.
1652
aa7758f7
CBW
1653=item suppress
1654
1655-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1656UTS #35 (LDML).
1657
539ce3d8 1658Contractions beginning with the specified characters are suppressed,
bd65daab 1659even if those contractions are defined in C<table>.
aa7758f7
CBW
1660
1661An example for Russian and some languages using the Cyrillic script:
1662
1663 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1664
1665where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1666
fcd4e2f8 1667B<NOTE>: Contractions via C<entry> will not be suppressed.
bd65daab 1668
45394607
JH
1669=item table
1670
f8187d97 1671-- see 3.8 Default Unicode Collation Element Table, UTS #10.
45394607 1672
91ae00cb 1673You can use another collation element table if desired.
45394607 1674
e7f779c8 1675The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1676on C<@INC>. Say, if the filename is F<Foo.txt>,
1677the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1678
1679By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1680If you will prepare your own table file, any name other than F<allkeys.txt>
1681may be better to avoid namespace conflict.
45394607 1682
00e00351
CBW
1683B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1684module, and it may save time at the run time.
60f577e0 1685Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
19265284
CBW
1686or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1687C<rewrite> will prevent this module from using the compiled DUCET.
00e00351 1688
a7fbee98 1689If C<undef> is passed explicitly as the value for this key,
91ae00cb 1690no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1691
1692A typical way to define a collation element table
1693without any file of table:
1694
1695 $onlyABC = Unicode::Collate->new(
1696 table => undef,
1697 entry => << 'ENTRIES',
16980061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
16990041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
17000062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17010042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17020063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17030043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1704ENTRIES
1705 );
905aa9f0 1706
3756e7ca
RGS
1707If C<ignoreName> or C<undefName> is used, character names should be
1708specified as a comment (following C<#>) on each line.
45394607
JH
1709
1710=item undefChar
1711
3756e7ca
RGS
1712=item undefName
1713
4d36a948 1714-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1715
584e761d 1716Undefines the collation element as if it were unassigned in the C<table>.
45394607
JH
1717This reduces the size of the table.
1718If an unassigned character appears in the string to be collated,
1719the sort key is made from its codepoint
1720as a single-character collation element,
1721as it is greater than any other assigned collation elements
1722(in the codepoint order among the unassigned characters).
1723But, it'd be better to ignore characters
1724unfamiliar to you and maybe never used.
1725
3756e7ca
RGS
1726Through C<undefChar>, any character matching C<qr/$undefChar/>
1727will be undefined. Through C<undefName>, any character whose name
1728(given in the C<table> file as a comment) matches C<qr/$undefName/>
1729will be undefined.
1730
e7f779c8
RGS
1731ex. Collation weights for beyond-BMP characters are not stored in object:
1732
1733 undefChar => qr/[^\0-\x{fffd}]/,
1734
45394607
JH
1735=item upper_before_lower
1736
3756e7ca 1737-- see 6.6 Case Comparisons, UTS #10.
45394607 1738
3756e7ca
RGS
1739By default, lowercase is before uppercase.
1740If the parameter is made true, this is reversed.
45394607 1741
3756e7ca
RGS
1742B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1743distinctions must occur in level 3, and their weights at level 3 must be
1744same as those mentioned in 7.3.1, UTS #10.
1745If you define your collation elements which differs from this requirement,
1746this parameter doesn't work validly.
45394607 1747
91ae00cb
NC
1748=item variable
1749
f8187d97 1750-- see 3.6 Variable Weighting, UTS #10.
91ae00cb 1751
bd65daab 1752This key allows for variable weighting of variable collation elements,
91ae00cb 1753which are marked with an ASTERISK in the table
f58b9ef1 1754(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
91ae00cb
NC
1755
1756 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1757
1758These names are case-insensitive.
1759By default (if specification is omitted), 'shifted' is adopted.
1760
1761 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1762 considered at the 4th level.
1763
abd1ec54 1764 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1765
1766 'Shifted' Variable elements are made ignorable at levels 1 through 3
1767 their level 4 weight is replaced by the old level 1 weight.
1768 Level 4 weight for Non-Variable elements is 0xFFFF.
1769
1770 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1771 are trimmed.
1772
45394607
JH
1773=back
1774
3164dd77 1775=head2 Methods for Collation
45394607
JH
1776
1777=over 4
1778
5398038e 1779=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1780
1781Sorts a list of strings.
1782
5398038e 1783=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1784
1785Returns 1 (when C<$a> is greater than C<$b>)
1786or 0 (when C<$a> is equal to C<$b>)
ed2081ad 1787or -1 (when C<$a> is less than C<$b>).
45394607 1788
5398038e
TS
1789=item C<$result = $Collator-E<gt>eq($a, $b)>
1790
1791=item C<$result = $Collator-E<gt>ne($a, $b)>
1792
1793=item C<$result = $Collator-E<gt>lt($a, $b)>
1794
1795=item C<$result = $Collator-E<gt>le($a, $b)>
1796
1797=item C<$result = $Collator-E<gt>gt($a, $b)>
1798
1799=item C<$result = $Collator-E<gt>ge($a, $b)>
1800
a7fbee98 1801They works like the same name operators as theirs.
5398038e
TS
1802
1803 eq : whether $a is equal to $b.
1804 ne : whether $a is not equal to $b.
ed2081ad
CBW
1805 lt : whether $a is less than $b.
1806 le : whether $a is less than $b or equal to $b.
5398038e
TS
1807 gt : whether $a is greater than $b.
1808 ge : whether $a is greater than $b or equal to $b.
1809
1810=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1811
3756e7ca 1812-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1813
1814Returns a sort key.
1815
1816You compare the sort keys using a binary comparison
1817and get the result of the comparison of the strings using UCA.
1818
5398038e 1819 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1820
1821 is equivalent to
1822
5398038e 1823 $Collator->cmp($a, $b)
45394607 1824
a7fbee98
JH
1825=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1826
3756e7ca
RGS
1827Converts a sorting key into its representation form.
1828If C<UCA_Version> is 8, the output is slightly different.
1829
a7fbee98
JH
1830 use Unicode::Collate;
1831 my $c = Unicode::Collate->new();
1832 print $c->viewSortKey("Perl"),"\n";
1833
0116f5dc
JH
1834 # output:
1835 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1836 # Level 1 Level 2 Level 3 Level 4
1837
4d36a948
TS
1838=back
1839
1840=head2 Methods for Searching
d16e9e3d 1841
4d36a948
TS
1842The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1843like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1844but they are not aware of any pattern, but only a literal substring.
1845
19265284
CBW
1846B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1847for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1848C<subst>, C<gsubst>) is croaked, as the position and the length might
1849differ from those on the specified string.
1850
1851C<rearrange> and C<hangul_terminator> parameters are neglected.
1852C<katakana_before_hiragana> and C<upper_before_lower> don't affect
ed2081ad 1853matching and searching, as it doesn't matter whether greater or less.
19265284 1854
4d36a948
TS
1855=over 4
1856
1857=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1858
1859=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1860
1861If C<$substring> matches a part of C<$string>, returns
1862the position of the first occurrence of the matching part in scalar context;
1863in list context, returns a two-element list of
1864the position and the length of the matching part.
1865
d16e9e3d
JH
1866If C<$substring> does not match any part of C<$string>,
1867returns C<-1> in scalar context and
1868an empty list in list context.
1869
8c1c815e
CBW
1870e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1871you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
d16e9e3d 1872
5398038e 1873 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1874 # (normalization => undef) is REQUIRED.
d16e9e3d 1875 my $match;
a7fbee98 1876 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1877 $match = substr($str, $pos, $len);
d16e9e3d
JH
1878 }
1879
8c1c815e
CBW
1880and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1881is primary equal to C<"M>E<uuml>C<SS">.
4d36a948
TS
1882
1883=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1884
1885=item C<($match) = $Collator-E<gt>match($string, $substring)>
1886
1887If C<$substring> matches a part of C<$string>, in scalar context, returns
1888B<a reference to> the first occurrence of the matching part
1889(C<$match_ref> is always true if matches,
1890since every reference is B<true>);
1891in list context, returns the first occurrence of the matching part.
1892
1893If C<$substring> does not match any part of C<$string>,
1894returns C<undef> in scalar context and
1895an empty list in list context.
1896
1897e.g.
1898
1899 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1900 print "matches [$$match_ref].\n";
1901 } else {
1902 print "doesn't match.\n";
1903 }
1904
3756e7ca 1905 or
4d36a948
TS
1906
1907 if (($match) = $Collator->match($str, $sub)) { # list context
1908 print "matches [$match].\n";
1909 } else {
1910 print "doesn't match.\n";
1911 }
1912
1913=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1914
1915If C<$substring> matches a part of C<$string>, returns
1916all the matching parts (or matching count in scalar context).
1917
1918If C<$substring> does not match any part of C<$string>,
1919returns an empty list.
1920
1921=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1922
1923If C<$substring> matches a part of C<$string>,
1924the first occurrence of the matching part is replaced by C<$replacement>
19265284 1925(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
4d36a948
TS
1926
1927C<$replacement> can be a C<CODEREF>,
1928taking the matching part as an argument,
1929and returning a string to replace the matching part
1930(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1931
1932=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1933
1934If C<$substring> matches a part of C<$string>,
19265284
CBW
1935all the occurrences of the matching part are replaced by C<$replacement>
1936(C<$string> is modified) and C<$count> is returned.
4d36a948
TS
1937
1938C<$replacement> can be a C<CODEREF>,
1939taking the matching part as an argument,
1940and returning a string to replace the matching part
1941(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1942
1943e.g.
1944
1945 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1946 # (normalization => undef) is REQUIRED.
19265284 1947 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
4d36a948
TS
1948 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1949
19265284 1950 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
4d36a948 1951 # i.e., all the camels are made bold-faced.
d16e9e3d 1952
19265284
CBW
1953 Examples: levels and ignore_level2 - what does camel match?
1954 ---------------------------------------------------------------------------
1955 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1956 -----------------------|---------------------------------------------------
1957 1 false | yes yes yes yes yes
1958 2 false | yes yes no yes yes
1959 3 false | yes no no yes yes
1960 4 false | yes no no no yes
1961 -----------------------|---------------------------------------------------
1962 1 true | yes yes yes yes yes
1963 2 true | yes yes yes yes yes
1964 3 true | yes no yes yes yes
1965 4 true | yes no yes no yes
1966 ---------------------------------------------------------------------------
1967 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1968 at any level.
1969
45394607
JH
1970=back
1971
3164dd77
TS
1972=head2 Other Methods
1973
1974=over 4
1975
0116f5dc
JH
1976=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1977
68adb2b0
CBW
1978=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1979
19265284 1980Changes the value of specified keys and returns the changed part.
0116f5dc
JH
1981
1982 $Collator = Unicode::Collate->new(level => 4);
1983
1984 $Collator->eq("perl", "PERL"); # false
1985
1986 %old = $Collator->change(level => 2); # returns (level => 4).
1987
1988 $Collator->eq("perl", "PERL"); # true
1989
1990 $Collator->change(%old); # returns (level => 2).
1991
1992 $Collator->eq("perl", "PERL"); # false
1993
1994Not all C<(key,value)>s are allowed to be changed.
1995See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1996
1997In the scalar context, returns the modified collator
1998(but it is B<not> a clone from the original).
1999
2000 $Collator->change(level => 2)->eq("perl", "PERL"); # true
2001
2002 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2003
2004 $Collator->change(level => 4)->eq("perl", "PERL"); # false
2005
91ae00cb 2006=item C<$version = $Collator-E<gt>version()>
3164dd77 2007
91ae00cb
NC
2008Returns the version number (a string) of the Unicode Standard
2009which the C<table> file used by the collator object is based on.
2010If the table does not include a version line (starting with C<@version>),
2011returns C<"unknown">.
2012
2013=item C<UCA_Version()>
3164dd77 2014
7b98b857
CBW
2015Returns the revision number of UTS #10 this module consults,
2016that should correspond with the DUCET incorporated.
3164dd77 2017
91ae00cb
NC
2018=item C<Base_Unicode_Version()>
2019
7b98b857
CBW
2020Returns the version number of UTS #10 this module consults,
2021that should correspond with the DUCET incorporated.
3164dd77
TS
2022
2023=back
2024
3756e7ca
RGS
2025=head1 EXPORT
2026
2027No method will be exported.
45394607 2028
0d50d293
RGS
2029=head1 INSTALL
2030
2031Though this module can be used without any C<table> file,
2032to use this module easily, it is recommended to install a table file
2033in the UCA format, by copying it under the directory
2034<a place in @INC>/Unicode/Collate.
2035
6d24ed10
SP
2036The most preferable one is "The Default Unicode Collation Element Table"
2037(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
2038
2039 http://www.unicode.org/Public/UCA/
2040
2041 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
2042
2043If DUCET is not installed, it is recommended to copy the file
2044from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2045to <a place in @INC>/Unicode/Collate/allkeys.txt
2046manually.
2047
3756e7ca
RGS
2048=head1 CAVEATS
2049
2050=over 4
45394607 2051
3756e7ca 2052=item Normalization
45394607 2053
3756e7ca
RGS
2054Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2055module (see L<Unicode::Normalize>).
45394607 2056
5398038e 2057If you need not it (say, in the case when you need not
45394607 2058handle any combining characters),
60f577e0 2059assign C<(normalization =E<gt> undef)> explicitly.
45394607 2060
4d36a948 2061-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 2062
3756e7ca 2063=item Conformance Test
0116f5dc 2064
10d7ec48
NC
2065The Conformance Test for the UCA is available
2066under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
2067
2068For F<CollationTest_SHIFTED.txt>,
2069a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2070for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 2071C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 2072
750da838
CBW
2073If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2074C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2075C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2076C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2077
4d36a948 2078B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 2079
3756e7ca
RGS
2080=back
2081
0d50d293 2082=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 2083
0d50d293 2084The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
fcd4e2f8 2085<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2016,
0d50d293 2086SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 2087
0d50d293
RGS
2088This module is free software; you can redistribute it and/or
2089modify it under the same terms as Perl itself.
45394607 2090
00e00351 2091The file Unicode/Collate/allkeys.txt was copied verbatim
fcd4e2f8
SH
2092from L<http://www.unicode.org/Public/UCA/8.0.0/allkeys.txt>.
2093For this file, Copyright (c) 2001-2015 Unicode, Inc.; distributed
2094under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
45394607
JH
2095
2096=head1 SEE ALSO
2097
2098=over 4
2099
91ae00cb
NC
2100=item Unicode Collation Algorithm - UTS #10
2101
2102L<http://www.unicode.org/reports/tr10/>
2103
2104=item The Default Unicode Collation Element Table (DUCET)
2105
10d7ec48 2106L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 2107
91ae00cb 2108=item The conformance test for the UCA
45394607 2109
10d7ec48 2110L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 2111
10d7ec48 2112L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 2113
91ae00cb 2114=item Hangul Syllable Type
0116f5dc 2115
10d7ec48 2116L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 2117
91ae00cb 2118=item Unicode Normalization Forms - UAX #15
a7fbee98 2119
91ae00cb 2120L<http://www.unicode.org/reports/tr15/>
a7fbee98 2121
aa7758f7
CBW
2122=item Unicode Locale Data Markup Language (LDML) - UTS #35
2123
2124L<http://www.unicode.org/reports/tr35/>
2125
45394607
JH
2126=back
2127
2128=cut