This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Sync with CPAN version 5.00 of Term::ANSIColor
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
d8e4b4ea
CBW
7 unless (0x41 == unpack('U', 'A')) {
8 die "Unicode::Collate cannot get a Unicode code point\n";
9 }
4a2e806c
JH
10}
11
45394607
JH
12use 5.006;
13use strict;
14use warnings;
15use Carp;
e69a2255 16use File::Spec;
5398038e 17
10d7ec48
NC
18no warnings 'utf8';
19
6330ea69 20our $VERSION = '1.27';
45394607
JH
21our $PACKAGE = __PACKAGE__;
22
19265284 23### begin XS only ###
f7a188d2
SH
24use XSLoader ();
25XSLoader::load('Unicode::Collate', $VERSION);
19265284 26### end XS only ###
f58b9ef1 27
e7f779c8 28my @Path = qw(Unicode Collate);
f7a188d2 29my $KeyFile = 'allkeys.txt';
45394607 30
4d36a948
TS
31# Perl's boolean
32use constant TRUE => 1;
33use constant FALSE => "";
34use constant NOMATCHPOS => -1;
35
36# A coderef to get combining class imported from Unicode::Normalize
37# (i.e. \&Unicode::Normalize::getCombinClass).
38# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 39my $CVgetCombinClass;
4d36a948 40
9f1f04a1
RGS
41# Supported Levels
42use constant MinLevel => 1;
43use constant MaxLevel => 4;
44
4d36a948 45# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
46use constant Min2Wt => 0x20;
47use constant Min3Wt => 0x02;
4d36a948
TS
48
49# Shifted weight at 4th level
9f1f04a1 50use constant Shift4Wt => 0xFFFF;
4d36a948 51
4d36a948 52# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
4d36a948
TS
53use constant VCE_TEMPLATE => 'Cn4';
54
4d36a948 55# A sort key: 16-bit weights
4d36a948
TS
56use constant KEY_TEMPLATE => 'n*';
57
750da838
CBW
58# The tie-breaking: 32-bit weights
59use constant TIE_TEMPLATE => 'N*';
60
4d36a948
TS
61# Level separator in a sort key:
62# i.e. pack(KEY_TEMPLATE, 0)
63use constant LEVEL_SEP => "\0\0";
64
65# As Unicode code point separator for hash keys.
66# A joined code point string (denoted by JCPS below)
67# like "65;768" is used for internal processing
68# instead of Perl's Unicode string like "\x41\x{300}",
69# as the native code point is different from the Unicode code point
70# on EBCDIC platform.
71# This character must not be included in any stringified
72# representation of an integer.
73use constant CODE_SEP => ';';
19265284 74 # NOTE: in regex /;/ is used for $jcps!
4d36a948
TS
75
76# boolean values of variable weights
0116f5dc
JH
77use constant NON_VAR => 0; # Non-Variable character
78use constant VAR => 1; # Variable character
3164dd77 79
91ae00cb 80# specific code points
00e00351
CBW
81use constant Hangul_SIni => 0xAC00;
82use constant Hangul_SFin => 0xD7A3;
584e761d 83
4d36a948 84# Logical_Order_Exception in PropList.txt
e7f779c8 85my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 86
6d3c12b5
CBW
87# for highestFFFF and minimalFFFE
88my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
89my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE);
90
6330ea69 91sub UCA_Version { '36' }
a7fbee98 92
6330ea69 93sub Base_Unicode_Version { '10.0.0' }
a7fbee98 94
9f1f04a1
RGS
95######
96
9f1f04a1 97sub pack_U {
ae6aa562 98 return pack('U*', @_);
9f1f04a1
RGS
99}
100
e9d2bd8a
CBW
101sub unpack_U {
102 return unpack('U*', shift(@_).pack('U*'));
103}
4779e03e 104
9f1f04a1
RGS
105######
106
91ae00cb
NC
107my (%VariableOK);
108@VariableOK{ qw/
0116f5dc 109 blanked non-ignorable shifted shift-trimmed
91ae00cb 110 / } = (); # keys lowercased
0116f5dc
JH
111
112our @ChangeOK = qw/
113 alternate backwards level normalization rearrange
19265284 114 katakana_before_hiragana upper_before_lower ignore_level2
60f577e0 115 overrideCJK overrideHangul overrideOut preprocess UCA_Version
6d3c12b5 116 hangul_terminator variable identical highestFFFF minimalFFFE
f8187d97 117 long_contraction
0116f5dc
JH
118 /;
119
120our @ChangeNG = qw/
b5d9a953 121 entry mapping table maxlength contraction
19265284
CBW
122 ignoreChar ignoreName undefChar undefName rewrite
123 versionTable alternateTable backwardsTable forwardsTable
124 rearrangeTable variableTable
00e00351 125 derivCode normCode rearrangeHash backwardsFlag
aa7758f7 126 suppress suppressHash
19265284 127 __useXS /; ### XS only
ed2081ad
CBW
128# The hash key 'ignored' was deleted at v 0.21.
129# The hash key 'isShift' was deleted at v 0.23.
130# The hash key 'combining' was deleted at v 0.24.
131# The hash key 'entries' was deleted at v 0.30.
132# The hash key 'L3_ignorable' was deleted at v 0.40.
91ae00cb
NC
133
134sub version {
135 my $self = shift;
136 return $self->{versionTable} || 'unknown';
137}
0116f5dc
JH
138
139my (%ChangeOK, %ChangeNG);
140@ChangeOK{ @ChangeOK } = ();
141@ChangeNG{ @ChangeNG } = ();
142
143sub change {
144 my $self = shift;
145 my %hash = @_;
146 my %old;
750da838
CBW
147 if (exists $hash{alternate}) {
148 if (exists $hash{variable}) {
149 delete $hash{alternate};
150 } else {
151 $hash{variable} = $hash{alternate};
152 }
91ae00cb 153 }
0116f5dc
JH
154 foreach my $k (keys %hash) {
155 if (exists $ChangeOK{$k}) {
156 $old{$k} = $self->{$k};
157 $self->{$k} = $hash{$k};
750da838 158 } elsif (exists $ChangeNG{$k}) {
0116f5dc
JH
159 croak "change of $k via change() is not allowed!";
160 }
161 # else => ignored
162 }
3756e7ca 163 $self->checkCollator();
0116f5dc
JH
164 return wantarray ? %old : $self;
165}
a7fbee98 166
9f1f04a1
RGS
167sub _checkLevel {
168 my $level = shift;
abd1ec54
NC
169 my $key = shift; # 'level' or 'backwards'
170 MinLevel <= $level or croak sprintf
171 "Illegal level %d (in value for key '%s') lower than %d.",
172 $level, $key, MinLevel;
173 $level <= MaxLevel or croak sprintf
174 "Unsupported level %d (in value for key '%s') higher than %d.",
175 $level, $key, MaxLevel;
9f1f04a1
RGS
176}
177
91ae00cb
NC
178my %DerivCode = (
179 8 => \&_derivCE_8,
180 9 => \&_derivCE_9,
181 11 => \&_derivCE_9, # 11 == 9
3756e7ca 182 14 => \&_derivCE_14,
74b94a79
CBW
183 16 => \&_derivCE_14, # 16 == 14
184 18 => \&_derivCE_18,
00e00351 185 20 => \&_derivCE_20,
b5d9a953 186 22 => \&_derivCE_22,
cba8842c 187 24 => \&_derivCE_24,
750da838 188 26 => \&_derivCE_24, # 26 == 24
2b488730 189 28 => \&_derivCE_24, # 28 == 24
c28567dd 190 30 => \&_derivCE_24, # 30 == 24
fcd4e2f8 191 32 => \&_derivCE_32,
6608d2d5 192 34 => \&_derivCE_34,
eaecde9a 193 36 => \&_derivCE_36,
91ae00cb
NC
194);
195
0116f5dc
JH
196sub checkCollator {
197 my $self = shift;
c7df802e 198 _checkLevel($self->{level}, 'level');
a7fbee98 199
91ae00cb
NC
200 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
201 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 202
91ae00cb 203 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 204 $self->{alternateTable} || 'shifted';
91ae00cb
NC
205 $self->{variable} = $self->{alternate} = lc($self->{variable});
206 exists $VariableOK{ $self->{variable} }
3756e7ca 207 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 208
9f1f04a1
RGS
209 if (! defined $self->{backwards}) {
210 $self->{backwardsFlag} = 0;
750da838 211 } elsif (! ref $self->{backwards}) {
c7df802e 212 _checkLevel($self->{backwards}, 'backwards');
9f1f04a1 213 $self->{backwardsFlag} = 1 << $self->{backwards};
750da838 214 } else {
9f1f04a1
RGS
215 my %level;
216 $self->{backwardsFlag} = 0;
217 for my $b (@{ $self->{backwards} }) {
c7df802e 218 _checkLevel($b, 'backwards');
9f1f04a1
RGS
219 $level{$b} = 1;
220 }
221 for my $v (sort keys %level) {
222 $self->{backwardsFlag} += 1 << $v;
223 }
224 }
0116f5dc 225
91ae00cb
NC
226 defined $self->{rearrange} or $self->{rearrange} = [];
227 ref $self->{rearrange}
228 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
229
230 # keys of $self->{rearrangeHash} are $self->{rearrange}.
231 $self->{rearrangeHash} = undef;
232
233 if (@{ $self->{rearrange} }) {
234 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
235 }
236
237 $self->{normCode} = undef;
a7fbee98
JH
238
239 if (defined $self->{normalization}) {
240 eval { require Unicode::Normalize };
91ae00cb 241 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 242
91ae00cb 243 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 244
91ae00cb
NC
245 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
246 $self->{normCode} = \&Unicode::Normalize::NFD;
247 }
248 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
249 my $norm = $self->{normalization};
250 $self->{normCode} = sub {
1d2654e1
JH
251 Unicode::Normalize::normalize($norm, shift);
252 };
06c8fc8f
RGS
253 eval { $self->{normCode}->("") }; # try
254 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 255 }
a7fbee98 256 }
0116f5dc
JH
257 return;
258}
259
260sub new
261{
262 my $class = shift;
263 my $self = bless { @_ }, $class;
45394607 264
19265284
CBW
265### begin XS only ###
266 if (! exists $self->{table} && !defined $self->{rewrite} &&
f58b9ef1
CBW
267 !defined $self->{undefName} && !defined $self->{ignoreName} &&
268 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
269 $self->{__useXS} = \&_fetch_simple;
19265284
CBW
270 } else {
271 $self->{__useXS} = undef;
272 }
273### end XS only ###
f58b9ef1 274
aa7758f7
CBW
275 # keys of $self->{suppressHash} are $self->{suppress}.
276 if ($self->{suppress} && @{ $self->{suppress} }) {
277 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
278 } # before read_table()
279
a7fbee98 280 # If undef is passed explicitly, no file is read.
0116f5dc 281 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 282 $self->read_table() if defined $self->{table};
905aa9f0 283
a7fbee98 284 if ($self->{entry}) {
e7f779c8 285 while ($self->{entry} =~ /([^\n]+)/g) {
bd65daab 286 $self->parseEntry($1, TRUE);
e7f779c8 287 }
a7fbee98 288 }
905aa9f0 289
f8187d97 290 # only in new(), not in change()
9f1f04a1 291 $self->{level} ||= MaxLevel;
0116f5dc 292 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 293
abd1ec54 294 $self->{overrideHangul} = FALSE
0116f5dc 295 if ! exists $self->{overrideHangul};
abd1ec54 296 $self->{overrideCJK} = FALSE
0116f5dc 297 if ! exists $self->{overrideCJK};
06c8fc8f 298 $self->{normalization} = 'NFD'
0116f5dc 299 if ! exists $self->{normalization};
3756e7ca
RGS
300 $self->{rearrange} = $self->{rearrangeTable} ||
301 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 302 if ! exists $self->{rearrange};
0116f5dc
JH
303 $self->{backwards} = $self->{backwardsTable}
304 if ! exists $self->{backwards};
f8187d97
SH
305 exists $self->{long_contraction} or $self->{long_contraction}
306 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
a7fbee98 307
f8187d97 308 # checkCollator() will be called in change()
3756e7ca 309 $self->checkCollator();
a7fbee98
JH
310
311 return $self;
312}
905aa9f0 313
00e00351
CBW
314sub parseAtmark {
315 my $self = shift;
316 my $line = shift; # after s/^\s*\@//
317
318 if ($line =~ /^version\s*(\S*)/) {
319 $self->{versionTable} ||= $1;
320 }
321 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
322 $self->{variableTable} ||= $1;
323 }
324 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
325 $self->{alternateTable} ||= $1;
326 }
327 elsif ($line =~ /^backwards\s+(\S*)/) {
328 push @{ $self->{backwardsTable} }, $1;
329 }
ed2081ad 330 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
00e00351
CBW
331 push @{ $self->{forwardsTable} }, $1;
332 }
333 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
334 push @{ $self->{rearrangeTable} }, _getHexArray($1);
335 }
336}
337
905aa9f0 338sub read_table {
a7fbee98 339 my $self = shift;
a7fbee98 340
19265284 341### begin XS only ###
f58b9ef1
CBW
342 if ($self->{__useXS}) {
343 my @rest = _fetch_rest(); # complex matter need to parse
344 for my $line (@rest) {
345 next if $line =~ /^\s*#/;
346
347 if ($line =~ s/^\s*\@//) {
348 $self->parseAtmark($line);
349 } else {
350 $self->parseEntry($line);
351 }
352 }
353 return;
354 }
19265284 355### end XS only ###
f58b9ef1 356
e7f779c8
RGS
357 my($f, $fh);
358 foreach my $d (@INC) {
359 $f = File::Spec->catfile($d, @Path, $self->{table});
360 last if open($fh, $f);
361 $f = undef;
362 }
0d50d293
RGS
363 if (!defined $f) {
364 $f = File::Spec->catfile(@Path, $self->{table});
365 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
366 }
a7fbee98 367
6d24ed10
SP
368 while (my $line = <$fh>) {
369 next if $line =~ /^\s*#/;
abd1ec54 370
00e00351
CBW
371 if ($line =~ s/^\s*\@//) {
372 $self->parseAtmark($line);
373 } else {
374 $self->parseEntry($line);
abd1ec54 375 }
45394607 376 }
e7f779c8 377 close $fh;
45394607
JH
378}
379
905aa9f0 380
45394607
JH
381##
382## get $line, parse it, and write an entry in $self
383##
384sub parseEntry
385{
a7fbee98
JH
386 my $self = shift;
387 my $line = shift;
bd65daab 388 my $tailoring = shift;
4d36a948 389 my($name, $entry, @uv, @key);
a7fbee98 390
19265284
CBW
391 if (defined $self->{rewrite}) {
392 $line = $self->{rewrite}->($line);
393 }
394
a7fbee98
JH
395 return if $line !~ /^\s*[0-9A-Fa-f]/;
396
397 # removes comment and gets name
398 $name = $1
399 if $line =~ s/[#%]\s*(.*)//;
400 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
401
402 # gets element
403 my($e, $k) = split /;/, $line;
404 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
405 if ! $k;
406
4d36a948
TS
407 @uv = _getHexArray($e);
408 return if !@uv;
bd65daab 409 return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
aa7758f7 410 exists $self->{suppressHash}{$uv[0]};
4d36a948 411 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 412
4d36a948 413 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 414 my $ele = pack_U(@uv);
a7fbee98 415
ed2081ad 416 # regarded as if it were not stored in the table
4d36a948
TS
417 return
418 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 419
4d36a948
TS
420 # replaced as completely ignorable
421 $k = '[.0000.0000.0000.0000]'
422 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 423 }
0116f5dc 424
4d36a948
TS
425 # replaced as completely ignorable
426 $k = '[.0000.0000.0000.0000]'
427 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
428
4c843366 429 my $is_L3_ignorable = TRUE;
4d36a948 430
caffd4cf
TS
431 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
432 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
TS
433 my @wt = _getHexArray($arr);
434 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 435 $is_L3_ignorable = FALSE
3756e7ca
RGS
436 if $wt[0] || $wt[1] || $wt[2];
437 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
438 # is completely ignorable.
4c843366
JH
439 # For expansion, an entry $is_L3_ignorable
440 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 441 }
caffd4cf 442
eaecde9a 443 # mapping: be an array ref or not exists (any false value is disallowed)
e7f779c8 444 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 445
eaecde9a 446 # maxlength: be more than 1 or not exists (any false value is disallowed)
91ae00cb 447 if (@uv > 1) {
b5d9a953
CBW
448 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
449 $self->{maxlength}{$uv[0]} = @uv;
450 }
451 }
eaecde9a
CBW
452
453 # contraction: be 1 or not exists (any false value is disallowed)
f8187d97
SH
454 while (@uv > 2) {
455 pop @uv;
456 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
457 $self->{contraction}{$fake_entry} = 1;
91ae00cb 458 }
45394607
JH
459}
460
9f1f04a1 461
45394607
JH
462sub viewSortKey
463{
a7fbee98 464 my $self = shift;
750da838
CBW
465 my $str = shift;
466 $self->visualizeSortKey($self->getSortKey($str));
9f1f04a1 467}
0116f5dc 468
d16e9e3d 469
750da838
CBW
470sub process
471{
472 my $self = shift;
473 my $str = shift;
474 my $prep = $self->{preprocess};
475 my $norm = $self->{normCode};
476
477 $str = &$prep($str) if ref $prep;
478 $str = &$norm($str) if ref $norm;
479 return $str;
480}
481
45394607 482##
91ae00cb 483## arrayref of JCPS = splitEnt(string to be collated)
750da838 484## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
45394607 485##
91ae00cb 486sub splitEnt
45394607 487{
a7fbee98 488 my $self = shift;
750da838
CBW
489 my $str = shift;
490 my $wLen = shift; # with Length
4d36a948 491
91ae00cb 492 my $map = $self->{mapping};
a7fbee98
JH
493 my $max = $self->{maxlength};
494 my $reH = $self->{rearrangeHash};
b5d9a953
CBW
495 my $vers = $self->{UCA_Version};
496 my $ver9 = $vers >= 9 && $vers <= 11;
f8187d97 497 my $long = $self->{long_contraction};
19265284 498 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 499
750da838 500 my @buf;
a7fbee98 501
4d36a948 502 # get array of Unicode code point of string.
9f1f04a1 503 my @src = unpack_U($str);
4d36a948
TS
504
505 # rearrangement:
506 # Character positions are not kept if rearranged,
507 # then neglected if $wLen is true.
508 if ($reH && ! $wLen) {
a7fbee98
JH
509 for (my $i = 0; $i < @src; $i++) {
510 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
511 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
512 $i++;
513 }
514 }
45394607 515 }
45394607 516
3756e7ca 517 # remove a code point marked as a completely ignorable.
abd1ec54 518 for (my $i = 0; $i < @src; $i++) {
60f577e0 519 if ($vers <= 20 && _isIllegal($src[$i])) {
b5d9a953
CBW
520 $src[$i] = undef;
521 } elsif ($ver9) {
eaecde9a 522 $src[$i] = undef if exists $map->{ $src[$i] }
6d3c12b5 523 ? @{ $map->{ $src[$i] } } == 0
f443a335 524 : $uXS && _ignorable_simple($src[$i]); ### XS only
b5d9a953 525 }
0116f5dc
JH
526 }
527
a7fbee98 528 for (my $i = 0; $i < @src; $i++) {
91ae00cb 529 my $jcps = $src[$i];
3756e7ca
RGS
530
531 # skip removed code point
532 if (! defined $jcps) {
533 if ($wLen && @buf) {
534 $buf[-1][2] = $i + 1;
535 }
536 next;
537 }
538
abd1ec54 539 my $i_orig = $i;
4d36a948 540
3756e7ca 541 # find contraction
eaecde9a 542 if (exists $max->{$jcps}) {
91ae00cb
NC
543 my $temp_jcps = $jcps;
544 my $jcpsLen = 1;
545 my $maxLen = $max->{$jcps};
4d36a948 546
91ae00cb 547 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 548 next if ! defined $src[$p];
91ae00cb
NC
549 $temp_jcps .= CODE_SEP . $src[$p];
550 $jcpsLen++;
eaecde9a 551 if (exists $map->{$temp_jcps}) {
91ae00cb 552 $jcps = $temp_jcps;
4d36a948
TS
553 $i = $p;
554 }
555 }
4d36a948 556
b5d9a953 557 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
06c8fc8f 558 # This process requires Unicode::Normalize.
91ae00cb 559 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
560 # (in spite of bool value of $CVgetCombinClass),
561 # since canonical ordering cannot be expected.
562 # Blocked combining character should not be contracted.
563
06c8fc8f 564 # $self->{normCode} is false in the case of "prenormalized".
b5d9a953
CBW
565 if ($self->{normalization}) {
566 my $cont = $self->{contraction};
06c8fc8f 567 my $preCC = 0;
b5d9a953
CBW
568 my $preCC_uc = 0;
569 my $jcps_uc = $jcps;
570 my(@out, @out_uc);
06c8fc8f
RGS
571
572 for (my $p = $i + 1; $p < @src; $p++) {
573 next if ! defined $src[$p];
b5d9a953 574 my $curCC = $CVgetCombinClass->($src[$p]);
06c8fc8f
RGS
575 last unless $curCC;
576 my $tail = CODE_SEP . $src[$p];
b5d9a953 577
eaecde9a 578 if ($preCC != $curCC && exists $map->{$jcps.$tail}) {
f8187d97
SH
579 $jcps .= $tail;
580 push @out, $p;
581 } else {
582 $preCC = $curCC;
583 }
584
585 next if !$long;
586
eaecde9a
CBW
587 if ($preCC_uc != $curCC &&
588 (exists $map->{$jcps_uc.$tail} ||
589 exists $cont->{$jcps_uc.$tail})) {
b5d9a953
CBW
590 $jcps_uc .= $tail;
591 push @out_uc, $p;
592 } else {
593 $preCC_uc = $curCC;
594 }
4d36a948 595 }
b5d9a953 596
eaecde9a 597 if (@out_uc && exists $map->{$jcps_uc}) {
b5d9a953
CBW
598 $jcps = $jcps_uc;
599 $src[$_] = undef for @out_uc;
600 } else {
601 $src[$_] = undef for @out;
602 }
a7fbee98 603 }
a7fbee98
JH
604 }
605
3756e7ca 606 # skip completely ignorable
eaecde9a 607 if (exists $map->{$jcps} ? @{ $map->{$jcps} } == 0 :
6d3c12b5 608 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
3756e7ca
RGS
609 if ($wLen && @buf) {
610 $buf[-1][2] = $i + 1;
a7fbee98 611 }
3756e7ca 612 next;
a7fbee98 613 }
4d36a948 614
91ae00cb 615 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 616 }
4d36a948 617 return \@buf;
d16e9e3d 618}
45394607 619
539ce3d8
CBW
620##
621## VCE = _pack_override(input, codepoint, derivCode)
622##
623sub _pack_override ($$$) {
624 my $r = shift;
625 my $u = shift;
626 my $der = shift;
627
628 if (ref $r) {
629 return pack(VCE_TEMPLATE, NON_VAR, @$r);
630 } elsif (defined $r) {
631 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
632 } else {
d8e4b4ea 633 $u = 0xFFFD if 0x10FFFF < $u;
539ce3d8
CBW
634 return $der->($u);
635 }
636}
d16e9e3d
JH
637
638##
abd1ec54 639## list of VCE = getWt(JCPS)
d16e9e3d
JH
640##
641sub getWt
642{
a7fbee98 643 my $self = shift;
91ae00cb
NC
644 my $u = shift;
645 my $map = $self->{mapping};
0116f5dc 646 my $der = $self->{derivCode};
60f577e0 647 my $out = $self->{overrideOut};
19265284 648 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 649
91ae00cb 650 return if !defined $u;
6d3c12b5
CBW
651 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
652 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
60f577e0
SH
653 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
654
655 my @ce;
eaecde9a 656 if (exists $map->{$u}) {
60f577e0 657 @ce = @{ $map->{$u} }; # $u may be a contraction
19265284 658### begin XS only ###
60f577e0
SH
659 } elsif ($uXS && _exists_simple($u)) {
660 @ce = _fetch_simple($u);
19265284 661### end XS only ###
60f577e0 662 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 663 my $hang = $self->{overrideHangul};
1d2654e1 664 if ($hang) {
60f577e0 665 @ce = map _pack_override($_, $u, $der), $hang->($u);
539ce3d8 666 } elsif (!defined $hang) {
60f577e0 667 @ce = $der->($u);
539ce3d8 668 } else {
1d2654e1
JH
669 my $max = $self->{maxlength};
670 my @decH = _decompHangul($u);
671
672 if (@decH == 2) {
673 my $contract = join(CODE_SEP, @decH);
eaecde9a 674 @decH = ($contract) if exists $map->{$contract};
1d2654e1 675 } else { # must be <@decH == 3>
eaecde9a 676 if (exists $max->{$decH[0]}) {
1d2654e1 677 my $contract = join(CODE_SEP, @decH);
eaecde9a 678 if (exists $map->{$contract}) {
1d2654e1
JH
679 @decH = ($contract);
680 } else {
681 $contract = join(CODE_SEP, @decH[0,1]);
eaecde9a 682 exists $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
683 }
684 # even if V's ignorable, LT contraction is not supported.
f58b9ef1 685 # If such a situation were required, NFD should be used.
1d2654e1 686 }
eaecde9a 687 if (@decH == 3 && exists $max->{$decH[1]}) {
1d2654e1 688 my $contract = join(CODE_SEP, @decH[1,2]);
eaecde9a 689 exists $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
690 }
691 }
692
60f577e0 693 @ce = map({
eaecde9a 694 exists $map->{$_} ? @{ $map->{$_} } :
19265284 695 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
f58b9ef1 696 $der->($_);
1d2654e1
JH
697 } @decH);
698 }
60f577e0
SH
699 } elsif ($out && 0x10FFFF < $u) {
700 @ce = map _pack_override($_, $u, $der), $out->($u);
539ce3d8 701 } else {
584e761d
CBW
702 my $cjk = $self->{overrideCJK};
703 my $vers = $self->{UCA_Version};
704 if ($cjk && _isUIdeo($u, $vers)) {
60f577e0
SH
705 @ce = map _pack_override($_, $u, $der), $cjk->($u);
706 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
707 @ce = _uideoCE_8($u);
708 } else {
709 @ce = $der->($u);
584e761d 710 }
a7fbee98 711 }
60f577e0 712 return map $self->varCE($_), @ce;
d16e9e3d
JH
713}
714
d16e9e3d
JH
715
716##
717## string sortkey = getSortKey(string arg)
718##
719sub getSortKey
720{
a7fbee98 721 my $self = shift;
750da838
CBW
722 my $orig = shift;
723 my $str = $self->process($orig);
724 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
00e00351 725 my $vers = $self->{UCA_Version};
aa7758f7 726 my $term = $self->{hangul_terminator};
750da838
CBW
727 my $lev = $self->{level};
728 my $iden = $self->{identical};
91ae00cb 729
abd1ec54 730 my @buf; # weight arrays
aa7758f7 731 if ($term) {
91ae00cb 732 my $preHST = '';
19265284 733 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
91ae00cb
NC
734 foreach my $jcps (@$rEnt) {
735 # weird things like VL, TL-contraction are not considered!
aa7758f7 736 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
91ae00cb
NC
737 if ($preHST && !$curHST || # hangul before non-hangul
738 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
739 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
740 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
aa7758f7 741 push @buf, $termCE;
91ae00cb
NC
742 }
743 $preHST = $curHST;
abd1ec54 744 push @buf, $self->getWt($jcps);
91ae00cb 745 }
aa7758f7
CBW
746 push @buf, $termCE if $preHST; # end at hangul
747 } else {
91ae00cb 748 foreach my $jcps (@$rEnt) {
abd1ec54 749 push @buf, $self->getWt($jcps);
91ae00cb
NC
750 }
751 }
752
750da838
CBW
753 my $rkey = $self->mk_SortKey(\@buf); ### XS only
754
755 if ($iden || $vers >= 26 && $lev == MaxLevel) {
756 $rkey .= LEVEL_SEP;
757 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
758 }
759 return $rkey;
45394607
JH
760}
761
762
763##
d16e9e3d 764## int compare = cmp(string a, string b)
45394607 765##
5398038e
TS
766sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
767sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
768sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
769sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
770sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
771sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
772sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
773
774##
d16e9e3d 775## list[strings] sorted = sort(list[strings] arg)
45394607 776##
a7fbee98
JH
777sub sort {
778 my $obj = shift;
779 return
780 map { $_->[1] }
781 sort{ $a->[0] cmp $b->[0] }
782 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
783}
784
0116f5dc 785
4d36a948
TS
786##
787## bool _nonIgnorAtLevel(arrayref weights, int level)
788##
789sub _nonIgnorAtLevel($$)
790{
791 my $wt = shift;
792 return if ! defined $wt;
793 my $lv = shift;
9f1f04a1 794 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
TS
795}
796
797##
798## bool _eqArray(
799## arrayref of arrayref[weights] source,
800## arrayref of arrayref[weights] substr,
801## int level)
802## * comparison of graphemes vs graphemes.
803## @$source >= @$substr must be true (check it before call this);
804##
805sub _eqArray($$$)
806{
807 my $source = shift;
808 my $substr = shift;
809 my $lev = shift;
810
811 for my $g (0..@$substr-1){
f58b9ef1 812 # Do the $g'th graphemes have the same number of AV weights?
4d36a948
TS
813 return if @{ $source->[$g] } != @{ $substr->[$g] };
814
815 for my $w (0..@{ $substr->[$g] }-1) {
816 for my $v (0..$lev-1) {
817 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
818 }
819 }
820 }
821 return 1;
822}
823
824##
825## (int position, int length)
17609435 826## int position = index(string, substring, position, [undoc'ed global])
4d36a948 827##
17609435 828## With "global" (only for the list context),
4d36a948
TS
829## returns list of arrayref[position, length].
830##
831sub index
832{
91ae00cb 833 my $self = shift;
750da838
CBW
834 $self->{preprocess} and
835 croak "Don't use Preprocess with index(), match(), etc.";
836 $self->{normCode} and
837 croak "Don't use Normalization with index(), match(), etc.";
838
91ae00cb
NC
839 my $str = shift;
840 my $len = length($str);
750da838
CBW
841 my $sub = shift;
842 my $subE = $self->splitEnt($sub);
91ae00cb
NC
843 my $pos = @_ ? shift : 0;
844 $pos = 0 if $pos < 0;
17609435 845 my $glob = shift;
91ae00cb
NC
846
847 my $lev = $self->{level};
3756e7ca
RGS
848 my $v2i = $self->{UCA_Version} >= 9 &&
849 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
850
851 if (! @$subE) {
4d36a948 852 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
17609435 853 return $glob
4d36a948
TS
854 ? map([$_, 0], $temp..$len)
855 : wantarray ? ($temp,0) : $temp;
856 }
abd1ec54
NC
857 $len < $pos
858 and return wantarray ? () : NOMATCHPOS;
91ae00cb 859 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
860 @$strE
861 or return wantarray ? () : NOMATCHPOS;
862
4d36a948
TS
863 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
864
abd1ec54
NC
865 my $last_is_variable;
866 for my $vwt (map $self->getWt($_), @$subE) {
867 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
868 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 869
3756e7ca
RGS
870 # "Ignorable (L1, L2) after Variable" since track. v. 9
871 if ($v2i) {
abd1ec54
NC
872 if ($var) {
873 $last_is_variable = TRUE;
874 }
875 elsif (!$wt[0]) { # ignorable
4d36a948 876 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
877 }
878 else {
879 $last_is_variable = FALSE;
4d36a948
TS
880 }
881 }
882
abd1ec54
NC
883 if (@subWt && !$var && !$wt[0]) {
884 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1393fe00 885 } elsif ($to_be_pushed) {
abd1ec54 886 push @subWt, [ \@wt ];
4d36a948 887 }
1393fe00 888 # else ===> skipped
4d36a948
TS
889 }
890
891 my $count = 0;
91ae00cb 892 my $end = @$strE - 1;
4d36a948 893
abd1ec54 894 $last_is_variable = FALSE; # reuse
4d36a948
TS
895 for (my $i = 0; $i <= $end; ) { # no $i++
896 my $found_base = 0;
897
898 # fetch a grapheme
899 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
900 for my $vwt ($self->getWt($strE->[$i][0])) {
901 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
902 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 903
3756e7ca
RGS
904 # "Ignorable (L1, L2) after Variable" since track. v. 9
905 if ($v2i) {
abd1ec54
NC
906 if ($var) {
907 $last_is_variable = TRUE;
908 }
909 elsif (!$wt[0]) { # ignorable
4d36a948 910 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
911 }
912 else {
913 $last_is_variable = FALSE;
4d36a948
TS
914 }
915 }
916
abd1ec54
NC
917 if (@strWt && !$var && !$wt[0]) {
918 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 919 $finPos[-1] = $strE->[$i][2];
4d36a948 920 } elsif ($to_be_pushed) {
abd1ec54 921 push @strWt, [ \@wt ];
91ae00cb 922 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 923 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 924 push @finPos, $strE->[$i][2];
4d36a948
TS
925 $found_base++;
926 }
927 # else ===> no-op
928 }
929 $i++;
930 }
931
932 # try to match
933 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
934 if ($iniPos[0] != NOMATCHPOS &&
935 $finPos[$#subWt] != NOMATCHPOS &&
936 _eqArray(\@strWt, \@subWt, $lev)) {
937 my $temp = $iniPos[0] + $pos;
938
17609435 939 if ($glob) {
4d36a948
TS
940 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
941 splice @strWt, 0, $#subWt;
942 splice @iniPos, 0, $#subWt;
943 splice @finPos, 0, $#subWt;
944 }
945 else {
946 return wantarray
947 ? ($temp, $finPos[$#subWt] - $iniPos[0])
948 : $temp;
949 }
950 }
951 shift @strWt;
952 shift @iniPos;
953 shift @finPos;
954 }
955 }
956
17609435 957 return $glob
4d36a948
TS
958 ? @g_ret
959 : wantarray ? () : NOMATCHPOS;
960}
961
962##
963## scalarref to matching part = match(string, substring)
964##
965sub match
966{
967 my $self = shift;
968 if (my($pos,$len) = $self->index($_[0], $_[1])) {
969 my $temp = substr($_[0], $pos, $len);
970 return wantarray ? $temp : \$temp;
971 # An lvalue ref \substr should be avoided,
972 # since its value is affected by modification of its referent.
973 }
974 else {
975 return;
976 }
977}
978
979##
980## arrayref matching parts = gmatch(string, substring)
981##
982sub gmatch
983{
984 my $self = shift;
985 my $str = shift;
986 my $sub = shift;
987 return map substr($str, $_->[0], $_->[1]),
988 $self->index($str, $sub, 0, 'g');
989}
990
991##
992## bool subst'ed = subst(string, substring, replace)
993##
994sub subst
995{
996 my $self = shift;
997 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
998
999 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1000 if ($code) {
1001 my $mat = substr($_[0], $pos, $len);
1002 substr($_[0], $pos, $len, $code->($mat));
1003 } else {
1004 substr($_[0], $pos, $len, $_[2]);
1005 }
1006 return TRUE;
1007 }
1008 else {
1009 return FALSE;
1010 }
1011}
1012
1013##
1014## int count = gsubst(string, substring, replace)
1015##
1016sub gsubst
1017{
1018 my $self = shift;
1019 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1020 my $cnt = 0;
1021
1022 # Replacement is carried out from the end, then use reverse.
1023 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1024 if ($code) {
1025 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1026 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1027 } else {
1028 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1029 }
1030 $cnt++;
1031 }
1032 return $cnt;
1033}
1034
45394607
JH
10351;
1036__END__
1037
1038=head1 NAME
1039
a7fbee98 1040Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1041
1042=head1 SYNOPSIS
1043
1044 use Unicode::Collate;
1045
1046 #construct
5398038e 1047 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1048
1049 #sort
5398038e 1050 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1051
1052 #compare
a7fbee98 1053 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1054
539ce3d8
CBW
1055B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1056according to Perl's Unicode support. See L<perlunicode>,
1057L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1058Otherwise you can use C<preprocess> or should decode them before.
91ae00cb 1059
45394607
JH
1060=head1 DESCRIPTION
1061
3756e7ca
RGS
1062This module is an implementation of Unicode Technical Standard #10
1063(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1064
45394607
JH
1065=head2 Constructor and Tailoring
1066
539ce3d8
CBW
1067The C<new> method returns a collator object. If new() is called
1068with no parameters, the collator should do the default collation.
d16e9e3d 1069
5398038e 1070 $Collator = Unicode::Collate->new(
0116f5dc 1071 UCA_Version => $UCA_Version,
aa7758f7 1072 alternate => $alternate, # alias for 'variable'
45394607
JH
1073 backwards => $levelNumber, # or \@levelNumbers
1074 entry => $element,
91ae00cb 1075 hangul_terminator => $term_primary_weight,
6d3c12b5 1076 highestFFFF => $bool,
750da838 1077 identical => $bool,
45394607
JH
1078 ignoreName => qr/$ignoreName/,
1079 ignoreChar => qr/$ignoreChar/,
19265284 1080 ignore_level2 => $bool,
45394607
JH
1081 katakana_before_hiragana => $bool,
1082 level => $collationLevel,
f8187d97 1083 long_contraction => $bool,
6d3c12b5 1084 minimalFFFE => $bool,
91ae00cb 1085 normalization => $normalization_form,
45394607
JH
1086 overrideCJK => \&overrideCJK,
1087 overrideHangul => \&overrideHangul,
1088 preprocess => \&preprocess,
1089 rearrange => \@charList,
19265284 1090 rewrite => \&rewrite,
aa7758f7 1091 suppress => \@charList,
45394607
JH
1092 table => $filename,
1093 undefName => qr/$undefName/,
1094 undefChar => qr/$undefChar/,
1095 upper_before_lower => $bool,
91ae00cb 1096 variable => $variable,
45394607 1097 );
45394607
JH
1098
1099=over 4
1100
0116f5dc
JH
1101=item UCA_Version
1102
7b98b857
CBW
1103If the revision (previously "tracking version") number of UCA is given,
1104behavior of that revision is emulated on collating.
0116f5dc 1105If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1106
6330ea69 1107The following revisions are supported. The default is 36.
3756e7ca 1108
0d50d293 1109 UCA Unicode Standard DUCET (@version)
b5d9a953 1110 -------------------------------------------------------
0d50d293
RGS
1111 8 3.1 3.0.1 (3.0.1d9)
1112 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1113 11 4.0 4.0.0 (4.0.0)
1114 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1115 16 5.0 5.0.0 (5.0.0)
1116 18 5.1.0 5.1.0 (5.1.0)
00e00351 1117 20 5.2.0 5.2.0 (5.2.0)
b5d9a953 1118 22 6.0.0 6.0.0 (6.0.0)
cba8842c 1119 24 6.1.0 6.1.0 (6.1.0)
750da838 1120 26 6.2.0 6.2.0 (6.2.0)
2b488730 1121 28 6.3.0 6.3.0 (6.3.0)
c28567dd 1122 30 7.0.0 7.0.0 (7.0.0)
fcd4e2f8 1123 32 8.0.0 8.0.0 (8.0.0)
6608d2d5 1124 34 9.0.0 9.0.0 (9.0.0)
eaecde9a 1125 36 10.0.0 10.0.0(10.0.0)
3756e7ca 1126
cc2b46b5 1127* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
f8187d97 1128
f58b9ef1 1129* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
b5d9a953
CBW
1130since C<UCA_Version> 22.
1131
60f577e0
SH
1132* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1133and can be overridden since C<UCA_Version> 22.
1134
b5d9a953
CBW
1135* Fully ignorable characters were ignored, and would not interrupt
1136contractions with C<UCA_Version> 9 and 11.
1137
1138* Treatment of ignorables after variables and some behaviors
1139were changed at C<UCA_Version> 9.
1140
1141* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1142depend on C<UCA_Version>.
1143
1144* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1145C<hangul_terminator>.
1146
3756e7ca 1147=item alternate
0116f5dc 1148
3756e7ca 1149-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1150
3756e7ca
RGS
1151For backward compatibility, C<alternate> (old name) can be used
1152as an alias for C<variable>.
0116f5dc 1153
45394607
JH
1154=item backwards
1155
750da838 1156-- see 3.4 Backward Accents, UTS #10.
45394607
JH
1157
1158 backwards => $levelNumber or \@levelNumbers
1159
1160Weights in reverse order; ex. level 2 (diacritic ordering) in French.
68adb2b0
CBW
1161If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1162forwards at all the levels.
45394607
JH
1163
1164=item entry
1165
f8187d97 1166-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
45394607 1167
91ae00cb
NC
1168If the same character (or a sequence of characters) exists
1169in the collation element table through C<table>,
f58b9ef1 1170mapping to collation elements is overridden.
91ae00cb 1171If it does not exist, the mapping is defined additionally.
45394607 1172
abd1ec54
NC
1173 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11740063 0068 ; [.0E6A.0020.0002.0063] # ch
11750043 0068 ; [.0E6A.0020.0007.0043] # Ch
11760043 0048 ; [.0E6A.0020.0008.0043] # CH
1177006C 006C ; [.0F4C.0020.0002.006C] # ll
1178004C 006C ; [.0F4C.0020.0007.004C] # Ll
1179004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
118000F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1181006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
118200D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1183004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1184ENTRY
1185
1186 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
118700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
118800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1189ENTRY
45394607 1190
4d36a948 1191B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1192B<must> be a Unicode code point (defined as hexadecimal),
1193but not a native code point.
4d36a948
TS
1194So C<0063> must always denote C<U+0063>,
1195but not a character of C<"\x63">.
1196
abd1ec54
NC
1197Weighting may vary depending on collation element table.
1198So ensure the weights defined in C<entry> will be consistent with
1199those in the collation element table loaded via C<table>.
1200
1201In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1202and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1203(as a value between C<0E60> and C<0E6D>)
1204makes ordering as C<C E<lt> CH E<lt> D>.
1205Exactly speaking DUCET already has some characters between C<C> and C<D>:
1206C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1207C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1208and C<c-curl> (C<U+0255>) with C<0E69>.
1209Then primary weight C<0E6A> for C<CH> makes C<CH>
1210ordered between C<c-curl> and C<D>.
1211
91ae00cb
NC
1212=item hangul_terminator
1213
3756e7ca 1214-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1215
1216If a true value is given (non-zero but should be positive),
1217it will be added as a terminator primary weight to the end of
1218every standard Hangul syllable. Secondary and any higher weights
1219for terminator are set to zero.
1220If the value is false or C<hangul_terminator> key does not exist,
1221insertion of terminator weights will not be performed.
1222
1223Boundaries of Hangul syllables are determined
1224according to conjoining Jamo behavior in F<the Unicode Standard>
1225and F<HangulSyllableType.txt>.
1226
1227B<Implementation Note:>
1228(1) For expansion mapping (Unicode character mapped
1229to a sequence of collation elements), a terminator will not be added
1230between collation elements, even if Hangul syllable boundary exists there.
1231Addition of terminator is restricted to the next position
1232to the last collation element.
1233
1234(2) Non-conjoining Hangul letters
1235(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1236automatically terminated with a terminator primary weight.
1237These characters may need terminator included in a collation element
1238table beforehand.
1239
6d3c12b5
CBW
1240=item highestFFFF
1241
cc2b46b5 1242-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
6d3c12b5
CBW
1243
1244If the parameter is made true, C<U+FFFF> has a highest primary weight.
1245When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1246C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1247begins with C<"abc">, or another primary equivalent.
1248C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1249such as C<"abc\x{FFFF}xyz">.
1250
1251C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
ed2081ad 1252almost, but the latter has a problem that you should know which letter is
6d3c12b5 1253next to C<c>. For a certain language where C<ch> as the next letter,
ed2081ad 1254C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
6d3c12b5 1255
60f577e0
SH
1256Note:
1257This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
f443a335 1258Any other character than C<U+FFFF> can be tailored by C<entry>.
6d3c12b5 1259
750da838
CBW
1260=item identical
1261
1262-- see A.3 Deterministic Comparison, UTS #10.
1263
1264By default, strings whose weights are equal should be equal,
1265even though their code points are not equal.
5c77070f 1266Completely ignorable characters are ignored.
750da838
CBW
1267
1268If the parameter is made true, a final, tie-breaking level is used.
5c77070f
CBW
1269If no difference of weights is found after the comparison through
1270all the level specified by C<level>, the comparison with code points
1271will be performed.
ed2081ad 1272For the tie-breaking comparison, the sort key has code points
5c77070f
CBW
1273of the original string appended.
1274Completely ignorable characters are not ignored.
750da838
CBW
1275
1276If C<preprocess> and/or C<normalization> is applied, the code points
1277of the string after them (in NFD by default) are used.
1278
45394607
JH
1279=item ignoreChar
1280
3756e7ca
RGS
1281=item ignoreName
1282
f8187d97 1283-- see 3.6 Variable Weighting, UTS #10.
45394607 1284
caffd4cf
TS
1285Makes the entry in the table completely ignorable;
1286i.e. as if the weights were zero at all level.
45394607 1287
3756e7ca
RGS
1288Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289will be ignored. Through C<ignoreName>, any character whose name
1290(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291will be ignored.
1292
a7fbee98 1293E.g. when 'a' and 'e' are ignorable,
45394607
JH
1294'element' is equal to 'lament' (or 'lmnt').
1295
19265284
CBW
1296=item ignore_level2
1297
1298-- see 5.1 Parametric Tailoring, UTS #10.
1299
1300By default, case-sensitive comparison (that is level 3 difference)
1301won't ignore accents (that is level 2 difference).
1302
1303If the parameter is made true, accents (and other primary ignorable
1304characters) are ignored, even though cases are taken into account.
1305
1306B<NOTE>: C<level> should be 3 or greater.
1307
3756e7ca
RGS
1308=item katakana_before_hiragana
1309
750da838 1310-- see 7.2 Tertiary Weight Table, UTS #10.
3756e7ca
RGS
1311
1312By default, hiragana is before katakana.
1313If the parameter is made true, this is reversed.
1314
1315B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1316distinctions must occur in level 3, and their weights at level 3 must be
1317same as those mentioned in 7.3.1, UTS #10.
1318If you define your collation elements which violate this requirement,
1319this parameter does not work validly.
1320
45394607
JH
1321=item level
1322
3756e7ca 1323-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1324
1325Set the maximum level.
1326Any higher levels than the specified one are ignored.
1327
1328 Level 1: alphabetic ordering
1329 Level 2: diacritic ordering
1330 Level 3: case ordering
91ae00cb 1331 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1332
1333 ex.level => 2,
1334
a7fbee98
JH
1335If omitted, the maximum is the 4th.
1336
750da838
CBW
1337B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1338But this module only uses weights within 0xFFFF.
1339When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1340and 'shift-trimmed'), the level 4 may be unreliable.
1341
1342See also C<identical>.
1343
f8187d97
SH
1344=item long_contraction
1345
1346-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1347
1348If the parameter is made true, for a contraction with three or more
1349characters (here nicknamed "long contraction"), initial substrings
1350will be handled.
1351For example, a contraction ABC, where A is a starter, and B and C
1352are non-starters (character with non-zero combining character class),
1353will be detected even if there is not AB as a contraction.
1354
1355B<Default:> Usually false.
1356If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1357is not specified in C<new()>, a true value is set implicitly.
1358This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1359
1360C<change()> handles C<long_contraction> explicitly only.
1361If C<long_contraction> is not specified in C<change()>, even though
1362C<UCA_Version> is changed, C<long_contraction> will not be changed.
1363
1364B<Limitation:> Scanning non-starters is one-way (no back tracking).
1365If AB is found but not ABC is not found, other long contraction where
1366the first character is A and the second is not B may not be found.
1367
1368Under C<(normalization =E<gt> undef)>, detection step of discontiguous
c28567dd 1369contractions will be skipped.
f8187d97
SH
1370
1371B<Note:> The following contractions in DUCET are not considered
1372in steps S2.1.1 to S2.1.3, where they are discontiguous.
1373
1374 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1375 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1376
1377For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1378(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1379In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1380instead of C<0FB2 0F71 0F80>.
1381Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1382contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1383
6d3c12b5
CBW
1384=item minimalFFFE
1385
cc2b46b5 1386-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
6d3c12b5
CBW
1387
1388If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1389The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1390first compares C<$a1> and C<$b1> at level 1, and
1391then C<$a2> and C<$b2> at level 1, as followed.
1392
1393 "ab\x{FFFE}a"
1394 "Ab\x{FFFE}a"
1395 "ab\x{FFFE}c"
1396 "Ab\x{FFFE}c"
1397 "ab\x{FFFE}xyz"
1398 "abc\x{FFFE}def"
1399 "abc\x{FFFE}xYz"
1400 "aBc\x{FFFE}xyz"
1401 "abcX\x{FFFE}def"
1402 "abcx\x{FFFE}xyz"
1403 "b\x{FFFE}aaa"
1404 "bbb\x{FFFE}a"
1405
60f577e0
SH
1406Note:
1407This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
f443a335 1408Any other character than C<U+FFFE> can be tailored by C<entry>.
6d3c12b5 1409
45394607
JH
1410=item normalization
1411
3756e7ca 1412-- see 4.1 Normalize, UTS #10.
45394607 1413
905aa9f0 1414If specified, strings are normalized before preparation of sort keys
45394607
JH
1415(the normalization is executed after preprocess).
1416
1d2654e1
JH
1417A form name C<Unicode::Normalize::normalize()> accepts will be applied
1418as C<$normalization_form>.
06c8fc8f 1419Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1420See C<Unicode::Normalize::normalize()> for detail.
1421If omitted, C<'NFD'> is used.
45394607 1422
91ae00cb 1423C<normalization> is performed after C<preprocess> (if defined).
45394607 1424
06c8fc8f
RGS
1425Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1426though they are not concerned with C<Unicode::Normalize::normalize()>.
1427
1428If C<undef> (not a string C<"undef">) is passed explicitly
1429as the value for this key,
45394607 1430any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1431if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1432only contiguous contractions are resolved;
1433e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1434C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1435In this point,
1436C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1437B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1438
1439In the case of C<(normalization =E<gt> "prenormalized")>,
1440any normalization is not performed, but
b5d9a953 1441discontiguous contractions with combining characters are performed.
06c8fc8f
RGS
1442Therefore
1443C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1444B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1445If source strings are finely prenormalized,
1446C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1447
1448Except C<(normalization =E<gt> undef)>,
1449B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1450
1451=item overrideCJK
1452
4d36a948 1453-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1454
584e761d 1455By default, CJK unified ideographs are ordered in Unicode codepoint
ed2081ad 1456order, but those in the CJK Unified Ideographs block are less than
584e761d 1457those in the CJK Unified Ideographs Extension A etc.
00e00351 1458
b5d9a953 1459 In the CJK Unified Ideographs block:
cba8842c
A
1460 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1461 U+4E00..U+9FBB if UCA_Version is 14 or 16.
b5d9a953 1462 U+4E00..U+9FC3 if UCA_Version is 18.
cba8842c 1463 U+4E00..U+9FCB if UCA_Version is 20 or 22.
fcd4e2f8 1464 U+4E00..U+9FCC if UCA_Version is 24 to 30.
6608d2d5 1465 U+4E00..U+9FD5 if UCA_Version is 32 or 34.
eaecde9a 1466 U+4E00..U+9FEA if UCA_Version is 36.
00e00351 1467
b5d9a953
CBW
1468 In the CJK Unified Ideographs Extension blocks:
1469 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
2b488730
CBW
1470 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1471 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
6608d2d5 1472 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32 or later.
eaecde9a 1473 Ext.F (U+2CEB0..U+2EBE0) if UCA_Version is 36.
91ae00cb 1474
584e761d 1475Through C<overrideCJK>, ordering of CJK unified ideographs (including
f58b9ef1 1476extensions) can be overridden.
45394607 1477
584e761d 1478ex. CJK unified ideographs in the JIS code point order.
45394607
JH
1479
1480 overrideCJK => sub {
a7fbee98
JH
1481 my $u = shift; # get a Unicode codepoint
1482 my $b = pack('n', $u); # to UTF-16BE
1483 my $s = your_unicode_to_sjis_converter($b); # convert
1484 my $n = unpack('n', $s); # convert sjis to short
1485 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1486 },
1487
539ce3d8
CBW
1488The return value may be an arrayref of 1st to 4th weights as shown
1489above. The return value may be an integer as the primary weight
1490as shown below. If C<undef> is returned, the default derived
1491collation element will be used.
1492
1493 overrideCJK => sub {
1494 my $u = shift; # get a Unicode codepoint
1495 my $b = pack('n', $u); # to UTF-16BE
1496 my $s = your_unicode_to_sjis_converter($b); # convert
1497 my $n = unpack('n', $s); # convert sjis to short
1498 return $n; # return the primary weight
1499 },
1500
1501The return value may be a list containing zero or more of
1502an arrayref, an integer, or C<undef>.
1503
584e761d 1504ex. ignores all CJK unified ideographs.
a7fbee98
JH
1505
1506 overrideCJK => sub {()}, # CODEREF returning empty list
1507
1508 # where ->eq("Pe\x{4E00}rl", "Perl") is true
584e761d 1509 # as U+4E00 is a CJK unified ideograph and to be ignorable.
a7fbee98 1510
60f577e0
SH
1511If a false value (including C<undef>) is passed, C<overrideCJK>
1512has no effect.
1513C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1514
584e761d
CBW
1515But assignment of weight for CJK unified ideographs
1516in C<table> or C<entry> is still valid.
60f577e0
SH
1517If C<undef> is passed explicitly as the value for this key,
1518weights for CJK unified ideographs are treated as undefined.
1519However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1520has no special meaning.
584e761d
CBW
1521
1522B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1523C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1524C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
f58b9ef1 1525ideographs. But they can't be overridden via C<overrideCJK> when you use
584e761d
CBW
1526DUCET, as the table includes weights for them. C<table> or C<entry> has
1527priority over C<overrideCJK>.
a7fbee98
JH
1528
1529=item overrideHangul
1530
4d36a948 1531-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1532
584e761d 1533By default, Hangul syllables are decomposed into Hangul Jamo,
abd1ec54 1534even if C<(normalization =E<gt> undef)>.
f58b9ef1 1535But the mapping of Hangul syllables may be overridden.
a7fbee98 1536
3756e7ca 1537This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1538
584e761d
CBW
1539If you want to override the mapping of Hangul syllables,
1540NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1541Hangul syllables before overriding. FCD may decompose Hangul syllables
1542as the case may be.
45394607 1543
60f577e0
SH
1544If a false value (but not C<undef>) is passed, C<overrideHangul>
1545has no effect.
1546C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1547
a7fbee98 1548If C<undef> is passed explicitly as the value for this key,
584e761d 1549weight for Hangul syllables is treated as undefined
a7fbee98 1550without decomposition into Hangul Jamo.
584e761d
CBW
1551But definition of weight for Hangul syllables
1552in C<table> or C<entry> is still valid.
a7fbee98 1553
60f577e0
SH
1554=item overrideOut
1555
1556-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1557
1558Perl seems to allow out-of-range values (greater than 0x10FFFF).
1559By default, out-of-range values are replaced with C<U+FFFD>
1560(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1561or ignored when C<UCA_Version> E<lt>= 20.
1562
1563When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1564can be overridden. Though C<table> or C<entry> are available for them,
1565out-of-range values are too many.
1566
1567C<overrideOut> can perform it algorithmically.
1568This parameter works like C<overrideCJK>, so see there for examples.
1569
1570ex. ignores all out-of-range values.
1571
1572 overrideOut => sub {()}, # CODEREF returning empty list
1573
1574If a false value (including C<undef>) is passed, C<overrideOut>
1575has no effect.
1576C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1577
d8e4b4ea
CBW
1578B<NOTE ABOUT U+FFFD:>
1579
60f577e0
SH
1580UCA recommends that out-of-range values should not be ignored for security
1581reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1582However, C<U+FFFD> is wrongly mapped to a variable collation element
1583in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1584ignored when C<variable> isn't C<Non-ignorable>.
1585
d8e4b4ea
CBW
1586The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1587see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1588(7.1.4 Trailing Weights). Such a correction is reproduced by this.
60f577e0
SH
1589
1590 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1591
d8e4b4ea 1592This workaround is unnecessary since Unicode 6.3.0.
60f577e0 1593
45394607
JH
1594=item preprocess
1595
750da838 1596-- see 5.4 Preprocessing, UTS #10.
45394607 1597
19265284 1598If specified, the coderef is used to preprocess each string
45394607
JH
1599before the formation of sort keys.
1600
a7fbee98 1601ex. dropping English articles, such as "a" or "the".
45394607
JH
1602Then, "the pen" is before "a pencil".
1603
1604 preprocess => sub {
1605 my $str = shift;
a7fbee98 1606 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1607 return $str;
45394607
JH
1608 },
1609
91ae00cb 1610C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1611
539ce3d8
CBW
1612ex. decoding strings in a legacy encoding such as shift-jis:
1613
1614 $sjis_collator = Unicode::Collate->new(
1615 preprocess => \&your_shiftjis_to_unicode_decoder,
1616 );
1617 @result = $sjis_collator->sort(@shiftjis_strings);
1618
1619B<Note:> Strings returned from the coderef will be interpreted
1620according to Perl's Unicode support. See L<perlunicode>,
1621L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1622
45394607
JH
1623=item rearrange
1624
750da838 1625-- see 3.5 Rearrangement, UTS #10.
45394607
JH
1626
1627Characters that are not coded in logical order and to be rearranged.
ed2081ad 1628If C<UCA_Version> is equal to or less than 11, default is:
45394607
JH
1629
1630 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1631
3756e7ca
RGS
1632If you want to disallow any rearrangement, pass C<undef> or C<[]>
1633(a reference to empty list) as the value for this key.
1634
74b94a79
CBW
1635If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1636(i.e. no rearrangement).
a7fbee98 1637
0116f5dc
JH
1638B<According to the version 9 of UCA, this parameter shall not be used;
1639but it is not warned at present.>
1640
19265284
CBW
1641=item rewrite
1642
1643If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1644The coderef will get each line, and then should return a rewritten line
1645according to the UCA file format.
1646If the coderef returns an empty line, the line will be skipped.
1647
1648e.g. any primary ignorable characters into tertiary ignorable:
1649
1650 rewrite => sub {
1651 my $line = shift;
1652 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1653 return $line;
1654 },
1655
1656This example shows rewriting weights. C<rewrite> is allowed to
1657affect code points, weights, and the name.
1658
1659B<NOTE>: C<table> is available to use another table file;
1660preparing a modified table once would be more efficient than
1661rewriting lines on reading an unmodified table every time.
1662
aa7758f7
CBW
1663=item suppress
1664
cc2b46b5 1665-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
aa7758f7 1666
539ce3d8 1667Contractions beginning with the specified characters are suppressed,
bd65daab 1668even if those contractions are defined in C<table>.
aa7758f7
CBW
1669
1670An example for Russian and some languages using the Cyrillic script:
1671
1672 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1673
1674where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1675
fcd4e2f8 1676B<NOTE>: Contractions via C<entry> will not be suppressed.
bd65daab 1677
45394607
JH
1678=item table
1679
f8187d97 1680-- see 3.8 Default Unicode Collation Element Table, UTS #10.
45394607 1681
91ae00cb 1682You can use another collation element table if desired.
45394607 1683
e7f779c8 1684The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1685on C<@INC>. Say, if the filename is F<Foo.txt>,
1686the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1687
1688By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1689If you will prepare your own table file, any name other than F<allkeys.txt>
1690may be better to avoid namespace conflict.
45394607 1691
00e00351
CBW
1692B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1693module, and it may save time at the run time.
60f577e0 1694Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
19265284
CBW
1695or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1696C<rewrite> will prevent this module from using the compiled DUCET.
00e00351 1697
a7fbee98 1698If C<undef> is passed explicitly as the value for this key,
91ae00cb 1699no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1700
1701A typical way to define a collation element table
1702without any file of table:
1703
1704 $onlyABC = Unicode::Collate->new(
1705 table => undef,
1706 entry => << 'ENTRIES',
17070061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
17080041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
17090062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17100042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17110063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17120043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1713ENTRIES
1714 );
905aa9f0 1715
3756e7ca
RGS
1716If C<ignoreName> or C<undefName> is used, character names should be
1717specified as a comment (following C<#>) on each line.
45394607
JH
1718
1719=item undefChar
1720
3756e7ca
RGS
1721=item undefName
1722
cc2b46b5 1723-- see 6.3.3 Reducing the Repertoire, UTS #10.
45394607 1724
584e761d 1725Undefines the collation element as if it were unassigned in the C<table>.
45394607
JH
1726This reduces the size of the table.
1727If an unassigned character appears in the string to be collated,
1728the sort key is made from its codepoint
1729as a single-character collation element,
1730as it is greater than any other assigned collation elements
1731(in the codepoint order among the unassigned characters).
1732But, it'd be better to ignore characters
1733unfamiliar to you and maybe never used.
1734
3756e7ca
RGS
1735Through C<undefChar>, any character matching C<qr/$undefChar/>
1736will be undefined. Through C<undefName>, any character whose name
1737(given in the C<table> file as a comment) matches C<qr/$undefName/>
1738will be undefined.
1739
e7f779c8
RGS
1740ex. Collation weights for beyond-BMP characters are not stored in object:
1741
1742 undefChar => qr/[^\0-\x{fffd}]/,
1743
45394607
JH
1744=item upper_before_lower
1745
3756e7ca 1746-- see 6.6 Case Comparisons, UTS #10.
45394607 1747
3756e7ca
RGS
1748By default, lowercase is before uppercase.
1749If the parameter is made true, this is reversed.
45394607 1750
3756e7ca
RGS
1751B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1752distinctions must occur in level 3, and their weights at level 3 must be
1753same as those mentioned in 7.3.1, UTS #10.
1754If you define your collation elements which differs from this requirement,
1755this parameter doesn't work validly.
45394607 1756
91ae00cb
NC
1757=item variable
1758
f8187d97 1759-- see 3.6 Variable Weighting, UTS #10.
91ae00cb 1760
bd65daab 1761This key allows for variable weighting of variable collation elements,
91ae00cb 1762which are marked with an ASTERISK in the table
f58b9ef1 1763(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
91ae00cb
NC
1764
1765 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1766
1767These names are case-insensitive.
1768By default (if specification is omitted), 'shifted' is adopted.
1769
1770 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1771 considered at the 4th level.
1772
abd1ec54 1773 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1774
1775 'Shifted' Variable elements are made ignorable at levels 1 through 3
1776 their level 4 weight is replaced by the old level 1 weight.
1777 Level 4 weight for Non-Variable elements is 0xFFFF.
1778
1779 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1780 are trimmed.
1781
45394607
JH
1782=back
1783
3164dd77 1784=head2 Methods for Collation
45394607
JH
1785
1786=over 4
1787
5398038e 1788=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1789
1790Sorts a list of strings.
1791
5398038e 1792=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1793
1794Returns 1 (when C<$a> is greater than C<$b>)
1795or 0 (when C<$a> is equal to C<$b>)
ed2081ad 1796or -1 (when C<$a> is less than C<$b>).
45394607 1797
5398038e
TS
1798=item C<$result = $Collator-E<gt>eq($a, $b)>
1799
1800=item C<$result = $Collator-E<gt>ne($a, $b)>
1801
1802=item C<$result = $Collator-E<gt>lt($a, $b)>
1803
1804=item C<$result = $Collator-E<gt>le($a, $b)>
1805
1806=item C<$result = $Collator-E<gt>gt($a, $b)>
1807
1808=item C<$result = $Collator-E<gt>ge($a, $b)>
1809
a7fbee98 1810They works like the same name operators as theirs.
5398038e
TS
1811
1812 eq : whether $a is equal to $b.
1813 ne : whether $a is not equal to $b.
ed2081ad
CBW
1814 lt : whether $a is less than $b.
1815 le : whether $a is less than $b or equal to $b.
5398038e
TS
1816 gt : whether $a is greater than $b.
1817 ge : whether $a is greater than $b or equal to $b.
1818
1819=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1820
3756e7ca 1821-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1822
1823Returns a sort key.
1824
1825You compare the sort keys using a binary comparison
1826and get the result of the comparison of the strings using UCA.
1827
5398038e 1828 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1829
1830 is equivalent to
1831
5398038e 1832 $Collator->cmp($a, $b)
45394607 1833
a7fbee98
JH
1834=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1835
3756e7ca
RGS
1836Converts a sorting key into its representation form.
1837If C<UCA_Version> is 8, the output is slightly different.
1838
a7fbee98
JH
1839 use Unicode::Collate;
1840 my $c = Unicode::Collate->new();
1841 print $c->viewSortKey("Perl"),"\n";
1842
0116f5dc
JH
1843 # output:
1844 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1845 # Level 1 Level 2 Level 3 Level 4
1846
4d36a948
TS
1847=back
1848
1849=head2 Methods for Searching
d16e9e3d 1850
4d36a948
TS
1851The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1852like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1853but they are not aware of any pattern, but only a literal substring.
1854
19265284
CBW
1855B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1856for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1857C<subst>, C<gsubst>) is croaked, as the position and the length might
1858differ from those on the specified string.
1859
1860C<rearrange> and C<hangul_terminator> parameters are neglected.
1861C<katakana_before_hiragana> and C<upper_before_lower> don't affect
ed2081ad 1862matching and searching, as it doesn't matter whether greater or less.
19265284 1863
4d36a948
TS
1864=over 4
1865
1866=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1867
1868=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1869
1870If C<$substring> matches a part of C<$string>, returns
1871the position of the first occurrence of the matching part in scalar context;
1872in list context, returns a two-element list of
1873the position and the length of the matching part.
1874
d16e9e3d
JH
1875If C<$substring> does not match any part of C<$string>,
1876returns C<-1> in scalar context and
1877an empty list in list context.
1878
8c1c815e
CBW
1879e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1880you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
d16e9e3d 1881
5398038e 1882 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1883 # (normalization => undef) is REQUIRED.
d16e9e3d 1884 my $match;
a7fbee98 1885 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1886 $match = substr($str, $pos, $len);
d16e9e3d
JH
1887 }
1888
8c1c815e
CBW
1889and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1890is primary equal to C<"M>E<uuml>C<SS">.
4d36a948
TS
1891
1892=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1893
1894=item C<($match) = $Collator-E<gt>match($string, $substring)>
1895
1896If C<$substring> matches a part of C<$string>, in scalar context, returns
1897B<a reference to> the first occurrence of the matching part
1898(C<$match_ref> is always true if matches,
1899since every reference is B<true>);
1900in list context, returns the first occurrence of the matching part.
1901
1902If C<$substring> does not match any part of C<$string>,
1903returns C<undef> in scalar context and
1904an empty list in list context.
1905
1906e.g.
1907
1908 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1909 print "matches [$$match_ref].\n";
1910 } else {
1911 print "doesn't match.\n";
1912 }
1913
3756e7ca 1914 or
4d36a948
TS
1915
1916 if (($match) = $Collator->match($str, $sub)) { # list context
1917 print "matches [$match].\n";
1918 } else {
1919 print "doesn't match.\n";
1920 }
1921
1922=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1923
1924If C<$substring> matches a part of C<$string>, returns
1925all the matching parts (or matching count in scalar context).
1926
1927If C<$substring> does not match any part of C<$string>,
1928returns an empty list.
1929
1930=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1931
1932If C<$substring> matches a part of C<$string>,
1933the first occurrence of the matching part is replaced by C<$replacement>
19265284 1934(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
4d36a948
TS
1935
1936C<$replacement> can be a C<CODEREF>,
1937taking the matching part as an argument,
1938and returning a string to replace the matching part
1939(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1940
1941=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1942
1943If C<$substring> matches a part of C<$string>,
19265284
CBW
1944all the occurrences of the matching part are replaced by C<$replacement>
1945(C<$string> is modified) and C<$count> is returned.
4d36a948
TS
1946
1947C<$replacement> can be a C<CODEREF>,
1948taking the matching part as an argument,
1949and returning a string to replace the matching part
1950(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1951
1952e.g.
1953
1954 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1955 # (normalization => undef) is REQUIRED.
19265284 1956 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
4d36a948
TS
1957 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1958
19265284 1959 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
4d36a948 1960 # i.e., all the camels are made bold-faced.
d16e9e3d 1961
19265284
CBW
1962 Examples: levels and ignore_level2 - what does camel match?
1963 ---------------------------------------------------------------------------
1964 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1965 -----------------------|---------------------------------------------------
1966 1 false | yes yes yes yes yes
1967 2 false | yes yes no yes yes
1968 3 false | yes no no yes yes
1969 4 false | yes no no no yes
1970 -----------------------|---------------------------------------------------
1971 1 true | yes yes yes yes yes
1972 2 true | yes yes yes yes yes
1973 3 true | yes no yes yes yes
1974 4 true | yes no yes no yes
1975 ---------------------------------------------------------------------------
1976 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1977 at any level.
1978
45394607
JH
1979=back
1980
3164dd77
TS
1981=head2 Other Methods
1982
1983=over 4
1984
0116f5dc
JH
1985=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1986
68adb2b0
CBW
1987=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1988
19265284 1989Changes the value of specified keys and returns the changed part.
0116f5dc
JH
1990
1991 $Collator = Unicode::Collate->new(level => 4);
1992
1993 $Collator->eq("perl", "PERL"); # false
1994
1995 %old = $Collator->change(level => 2); # returns (level => 4).
1996
1997 $Collator->eq("perl", "PERL"); # true
1998
1999 $Collator->change(%old); # returns (level => 2).
2000
2001 $Collator->eq("perl", "PERL"); # false
2002
2003Not all C<(key,value)>s are allowed to be changed.
2004See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
2005
2006In the scalar context, returns the modified collator
2007(but it is B<not> a clone from the original).
2008
2009 $Collator->change(level => 2)->eq("perl", "PERL"); # true
2010
2011 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2012
2013 $Collator->change(level => 4)->eq("perl", "PERL"); # false
2014
91ae00cb 2015=item C<$version = $Collator-E<gt>version()>
3164dd77 2016
91ae00cb
NC
2017Returns the version number (a string) of the Unicode Standard
2018which the C<table> file used by the collator object is based on.
2019If the table does not include a version line (starting with C<@version>),
2020returns C<"unknown">.
2021
2022=item C<UCA_Version()>
3164dd77 2023
7b98b857
CBW
2024Returns the revision number of UTS #10 this module consults,
2025that should correspond with the DUCET incorporated.
3164dd77 2026
91ae00cb
NC
2027=item C<Base_Unicode_Version()>
2028
7b98b857
CBW
2029Returns the version number of UTS #10 this module consults,
2030that should correspond with the DUCET incorporated.
3164dd77
TS
2031
2032=back
2033
3756e7ca
RGS
2034=head1 EXPORT
2035
2036No method will be exported.
45394607 2037
0d50d293
RGS
2038=head1 INSTALL
2039
2040Though this module can be used without any C<table> file,
2041to use this module easily, it is recommended to install a table file
2042in the UCA format, by copying it under the directory
2043<a place in @INC>/Unicode/Collate.
2044
6d24ed10
SP
2045The most preferable one is "The Default Unicode Collation Element Table"
2046(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
2047
2048 http://www.unicode.org/Public/UCA/
2049
6608d2d5
CBW
2050 http://www.unicode.org/Public/UCA/latest/allkeys.txt
2051 (latest version)
0d50d293
RGS
2052
2053If DUCET is not installed, it is recommended to copy the file
2054from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2055to <a place in @INC>/Unicode/Collate/allkeys.txt
2056manually.
2057
3756e7ca
RGS
2058=head1 CAVEATS
2059
2060=over 4
45394607 2061
3756e7ca 2062=item Normalization
45394607 2063
3756e7ca
RGS
2064Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2065module (see L<Unicode::Normalize>).
45394607 2066
5398038e 2067If you need not it (say, in the case when you need not
45394607 2068handle any combining characters),
60f577e0 2069assign C<(normalization =E<gt> undef)> explicitly.
45394607 2070
4d36a948 2071-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 2072
3756e7ca 2073=item Conformance Test
0116f5dc 2074
10d7ec48
NC
2075The Conformance Test for the UCA is available
2076under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
2077
2078For F<CollationTest_SHIFTED.txt>,
2079a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2080for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 2081C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 2082
750da838
CBW
2083If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2084C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2085C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2086C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2087
4d36a948 2088B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 2089
3756e7ca
RGS
2090=back
2091
0d50d293 2092=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 2093
0d50d293 2094The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
6330ea69 2095<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2018,
0d50d293 2096SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 2097
0d50d293
RGS
2098This module is free software; you can redistribute it and/or
2099modify it under the same terms as Perl itself.
45394607 2100
00e00351 2101The file Unicode/Collate/allkeys.txt was copied verbatim
6608d2d5
CBW
2102from L<http://www.unicode.org/Public/UCA/9.0.0/allkeys.txt>.
2103For this file, Copyright (c) 2016 Unicode, Inc.; distributed
fcd4e2f8 2104under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
45394607
JH
2105
2106=head1 SEE ALSO
2107
2108=over 4
2109
91ae00cb
NC
2110=item Unicode Collation Algorithm - UTS #10
2111
2112L<http://www.unicode.org/reports/tr10/>
2113
2114=item The Default Unicode Collation Element Table (DUCET)
2115
10d7ec48 2116L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 2117
91ae00cb 2118=item The conformance test for the UCA
45394607 2119
10d7ec48 2120L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 2121
10d7ec48 2122L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 2123
91ae00cb 2124=item Hangul Syllable Type
0116f5dc 2125
10d7ec48 2126L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 2127
91ae00cb 2128=item Unicode Normalization Forms - UAX #15
a7fbee98 2129
91ae00cb 2130L<http://www.unicode.org/reports/tr15/>
a7fbee98 2131
aa7758f7
CBW
2132=item Unicode Locale Data Markup Language (LDML) - UTS #35
2133
2134L<http://www.unicode.org/reports/tr35/>
2135
45394607
JH
2136=back
2137
2138=cut