This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[PATCH] Bump Locale-Codes from 3.32 to 3.33
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c 6 }
d8e4b4ea
CBW
7 unless (0x41 == unpack('U', 'A')) {
8 die "Unicode::Collate cannot get a Unicode code point\n";
9 }
4a2e806c
JH
10}
11
45394607
JH
12use 5.006;
13use strict;
14use warnings;
15use Carp;
e69a2255 16use File::Spec;
5398038e 17
10d7ec48
NC
18no warnings 'utf8';
19
f8187d97 20our $VERSION = '1.07';
45394607
JH
21our $PACKAGE = __PACKAGE__;
22
19265284 23### begin XS only ###
f58b9ef1
CBW
24require DynaLoader;
25our @ISA = qw(DynaLoader);
26bootstrap Unicode::Collate $VERSION;
19265284 27### end XS only ###
f58b9ef1 28
e7f779c8
RGS
29my @Path = qw(Unicode Collate);
30my $KeyFile = "allkeys.txt";
45394607 31
4d36a948
TS
32# Perl's boolean
33use constant TRUE => 1;
34use constant FALSE => "";
35use constant NOMATCHPOS => -1;
36
37# A coderef to get combining class imported from Unicode::Normalize
38# (i.e. \&Unicode::Normalize::getCombinClass).
39# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 40my $CVgetCombinClass;
4d36a948 41
9f1f04a1
RGS
42# Supported Levels
43use constant MinLevel => 1;
44use constant MaxLevel => 4;
45
4d36a948 46# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
47use constant Min2Wt => 0x20;
48use constant Min3Wt => 0x02;
4d36a948
TS
49
50# Shifted weight at 4th level
9f1f04a1 51use constant Shift4Wt => 0xFFFF;
4d36a948 52
4d36a948 53# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
4d36a948
TS
54use constant VCE_TEMPLATE => 'Cn4';
55
4d36a948 56# A sort key: 16-bit weights
4d36a948
TS
57use constant KEY_TEMPLATE => 'n*';
58
750da838
CBW
59# The tie-breaking: 32-bit weights
60use constant TIE_TEMPLATE => 'N*';
61
4d36a948
TS
62# Level separator in a sort key:
63# i.e. pack(KEY_TEMPLATE, 0)
64use constant LEVEL_SEP => "\0\0";
65
66# As Unicode code point separator for hash keys.
67# A joined code point string (denoted by JCPS below)
68# like "65;768" is used for internal processing
69# instead of Perl's Unicode string like "\x41\x{300}",
70# as the native code point is different from the Unicode code point
71# on EBCDIC platform.
72# This character must not be included in any stringified
73# representation of an integer.
74use constant CODE_SEP => ';';
19265284 75 # NOTE: in regex /;/ is used for $jcps!
4d36a948
TS
76
77# boolean values of variable weights
0116f5dc
JH
78use constant NON_VAR => 0; # Non-Variable character
79use constant VAR => 1; # Variable character
3164dd77 80
91ae00cb 81# specific code points
00e00351
CBW
82use constant Hangul_SIni => 0xAC00;
83use constant Hangul_SFin => 0xD7A3;
584e761d 84
4d36a948 85# Logical_Order_Exception in PropList.txt
e7f779c8 86my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 87
6d3c12b5
CBW
88# for highestFFFF and minimalFFFE
89my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
90my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE);
91
2b488730 92sub UCA_Version { "28" }
a7fbee98 93
2b488730 94sub Base_Unicode_Version { "6.3.0" }
a7fbee98 95
9f1f04a1
RGS
96######
97
9f1f04a1 98sub pack_U {
ae6aa562 99 return pack('U*', @_);
9f1f04a1
RGS
100}
101
d2309057
CBW
102### begin XS only ###
103*unpack_U = exists &Unicode::Collate::bootstrap &&
104 $] < 5.008 && \&unpackUfor56 && 0x41 == unpackUfor56('A')
105 ? \&unpackUfor56 : sub { return unpack('U*', shift(@_).pack('U*')) };
106### end XS only ###
4779e03e 107
9f1f04a1
RGS
108######
109
91ae00cb
NC
110my (%VariableOK);
111@VariableOK{ qw/
0116f5dc 112 blanked non-ignorable shifted shift-trimmed
91ae00cb 113 / } = (); # keys lowercased
0116f5dc
JH
114
115our @ChangeOK = qw/
116 alternate backwards level normalization rearrange
19265284 117 katakana_before_hiragana upper_before_lower ignore_level2
60f577e0 118 overrideCJK overrideHangul overrideOut preprocess UCA_Version
6d3c12b5 119 hangul_terminator variable identical highestFFFF minimalFFFE
f8187d97 120 long_contraction
0116f5dc
JH
121 /;
122
123our @ChangeNG = qw/
b5d9a953 124 entry mapping table maxlength contraction
19265284
CBW
125 ignoreChar ignoreName undefChar undefName rewrite
126 versionTable alternateTable backwardsTable forwardsTable
127 rearrangeTable variableTable
00e00351 128 derivCode normCode rearrangeHash backwardsFlag
aa7758f7 129 suppress suppressHash
19265284 130 __useXS /; ### XS only
ed2081ad
CBW
131# The hash key 'ignored' was deleted at v 0.21.
132# The hash key 'isShift' was deleted at v 0.23.
133# The hash key 'combining' was deleted at v 0.24.
134# The hash key 'entries' was deleted at v 0.30.
135# The hash key 'L3_ignorable' was deleted at v 0.40.
91ae00cb
NC
136
137sub version {
138 my $self = shift;
139 return $self->{versionTable} || 'unknown';
140}
0116f5dc
JH
141
142my (%ChangeOK, %ChangeNG);
143@ChangeOK{ @ChangeOK } = ();
144@ChangeNG{ @ChangeNG } = ();
145
146sub change {
147 my $self = shift;
148 my %hash = @_;
149 my %old;
750da838
CBW
150 if (exists $hash{alternate}) {
151 if (exists $hash{variable}) {
152 delete $hash{alternate};
153 } else {
154 $hash{variable} = $hash{alternate};
155 }
91ae00cb 156 }
0116f5dc
JH
157 foreach my $k (keys %hash) {
158 if (exists $ChangeOK{$k}) {
159 $old{$k} = $self->{$k};
160 $self->{$k} = $hash{$k};
750da838 161 } elsif (exists $ChangeNG{$k}) {
0116f5dc
JH
162 croak "change of $k via change() is not allowed!";
163 }
164 # else => ignored
165 }
3756e7ca 166 $self->checkCollator();
0116f5dc
JH
167 return wantarray ? %old : $self;
168}
a7fbee98 169
9f1f04a1
RGS
170sub _checkLevel {
171 my $level = shift;
abd1ec54
NC
172 my $key = shift; # 'level' or 'backwards'
173 MinLevel <= $level or croak sprintf
174 "Illegal level %d (in value for key '%s') lower than %d.",
175 $level, $key, MinLevel;
176 $level <= MaxLevel or croak sprintf
177 "Unsupported level %d (in value for key '%s') higher than %d.",
178 $level, $key, MaxLevel;
9f1f04a1
RGS
179}
180
91ae00cb
NC
181my %DerivCode = (
182 8 => \&_derivCE_8,
183 9 => \&_derivCE_9,
184 11 => \&_derivCE_9, # 11 == 9
3756e7ca 185 14 => \&_derivCE_14,
74b94a79
CBW
186 16 => \&_derivCE_14, # 16 == 14
187 18 => \&_derivCE_18,
00e00351 188 20 => \&_derivCE_20,
b5d9a953 189 22 => \&_derivCE_22,
cba8842c 190 24 => \&_derivCE_24,
750da838 191 26 => \&_derivCE_24, # 26 == 24
2b488730 192 28 => \&_derivCE_24, # 28 == 24
91ae00cb
NC
193);
194
0116f5dc
JH
195sub checkCollator {
196 my $self = shift;
9f1f04a1 197 _checkLevel($self->{level}, "level");
a7fbee98 198
91ae00cb
NC
199 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
200 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 201
91ae00cb 202 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 203 $self->{alternateTable} || 'shifted';
91ae00cb
NC
204 $self->{variable} = $self->{alternate} = lc($self->{variable});
205 exists $VariableOK{ $self->{variable} }
3756e7ca 206 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 207
9f1f04a1
RGS
208 if (! defined $self->{backwards}) {
209 $self->{backwardsFlag} = 0;
750da838 210 } elsif (! ref $self->{backwards}) {
9f1f04a1
RGS
211 _checkLevel($self->{backwards}, "backwards");
212 $self->{backwardsFlag} = 1 << $self->{backwards};
750da838 213 } else {
9f1f04a1
RGS
214 my %level;
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
218 $level{$b} = 1;
219 }
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
222 }
223 }
0116f5dc 224
91ae00cb
NC
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
228
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
231
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
234 }
235
236 $self->{normCode} = undef;
a7fbee98
JH
237
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
91ae00cb 240 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 241
91ae00cb 242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 243
91ae00cb
NC
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
246 }
247 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
1d2654e1
JH
250 Unicode::Normalize::normalize($norm, shift);
251 };
06c8fc8f
RGS
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 254 }
a7fbee98 255 }
0116f5dc
JH
256 return;
257}
258
259sub new
260{
261 my $class = shift;
262 my $self = bless { @_ }, $class;
45394607 263
19265284
CBW
264### begin XS only ###
265 if (! exists $self->{table} && !defined $self->{rewrite} &&
f58b9ef1
CBW
266 !defined $self->{undefName} && !defined $self->{ignoreName} &&
267 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
268 $self->{__useXS} = \&_fetch_simple;
19265284
CBW
269 } else {
270 $self->{__useXS} = undef;
271 }
272### end XS only ###
f58b9ef1 273
aa7758f7
CBW
274 # keys of $self->{suppressHash} are $self->{suppress}.
275 if ($self->{suppress} && @{ $self->{suppress} }) {
276 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
277 } # before read_table()
278
a7fbee98 279 # If undef is passed explicitly, no file is read.
0116f5dc 280 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 281 $self->read_table() if defined $self->{table};
905aa9f0 282
a7fbee98 283 if ($self->{entry}) {
e7f779c8 284 while ($self->{entry} =~ /([^\n]+)/g) {
bd65daab 285 $self->parseEntry($1, TRUE);
e7f779c8 286 }
a7fbee98 287 }
905aa9f0 288
f8187d97 289 # only in new(), not in change()
9f1f04a1 290 $self->{level} ||= MaxLevel;
0116f5dc 291 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 292
abd1ec54 293 $self->{overrideHangul} = FALSE
0116f5dc 294 if ! exists $self->{overrideHangul};
abd1ec54 295 $self->{overrideCJK} = FALSE
0116f5dc 296 if ! exists $self->{overrideCJK};
06c8fc8f 297 $self->{normalization} = 'NFD'
0116f5dc 298 if ! exists $self->{normalization};
3756e7ca
RGS
299 $self->{rearrange} = $self->{rearrangeTable} ||
300 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 301 if ! exists $self->{rearrange};
0116f5dc
JH
302 $self->{backwards} = $self->{backwardsTable}
303 if ! exists $self->{backwards};
f8187d97
SH
304 exists $self->{long_contraction} or $self->{long_contraction}
305 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
a7fbee98 306
f8187d97 307 # checkCollator() will be called in change()
3756e7ca 308 $self->checkCollator();
a7fbee98
JH
309
310 return $self;
311}
905aa9f0 312
00e00351
CBW
313sub parseAtmark {
314 my $self = shift;
315 my $line = shift; # after s/^\s*\@//
316
317 if ($line =~ /^version\s*(\S*)/) {
318 $self->{versionTable} ||= $1;
319 }
320 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
321 $self->{variableTable} ||= $1;
322 }
323 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
324 $self->{alternateTable} ||= $1;
325 }
326 elsif ($line =~ /^backwards\s+(\S*)/) {
327 push @{ $self->{backwardsTable} }, $1;
328 }
ed2081ad 329 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
00e00351
CBW
330 push @{ $self->{forwardsTable} }, $1;
331 }
332 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
333 push @{ $self->{rearrangeTable} }, _getHexArray($1);
334 }
335}
336
905aa9f0 337sub read_table {
a7fbee98 338 my $self = shift;
a7fbee98 339
19265284 340### begin XS only ###
f58b9ef1
CBW
341 if ($self->{__useXS}) {
342 my @rest = _fetch_rest(); # complex matter need to parse
343 for my $line (@rest) {
344 next if $line =~ /^\s*#/;
345
346 if ($line =~ s/^\s*\@//) {
347 $self->parseAtmark($line);
348 } else {
349 $self->parseEntry($line);
350 }
351 }
352 return;
353 }
19265284 354### end XS only ###
f58b9ef1 355
e7f779c8
RGS
356 my($f, $fh);
357 foreach my $d (@INC) {
358 $f = File::Spec->catfile($d, @Path, $self->{table});
359 last if open($fh, $f);
360 $f = undef;
361 }
0d50d293
RGS
362 if (!defined $f) {
363 $f = File::Spec->catfile(@Path, $self->{table});
364 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
365 }
a7fbee98 366
6d24ed10
SP
367 while (my $line = <$fh>) {
368 next if $line =~ /^\s*#/;
abd1ec54 369
00e00351
CBW
370 if ($line =~ s/^\s*\@//) {
371 $self->parseAtmark($line);
372 } else {
373 $self->parseEntry($line);
abd1ec54 374 }
45394607 375 }
e7f779c8 376 close $fh;
45394607
JH
377}
378
905aa9f0 379
45394607
JH
380##
381## get $line, parse it, and write an entry in $self
382##
383sub parseEntry
384{
a7fbee98
JH
385 my $self = shift;
386 my $line = shift;
bd65daab 387 my $tailoring = shift;
4d36a948 388 my($name, $entry, @uv, @key);
a7fbee98 389
19265284
CBW
390 if (defined $self->{rewrite}) {
391 $line = $self->{rewrite}->($line);
392 }
393
a7fbee98
JH
394 return if $line !~ /^\s*[0-9A-Fa-f]/;
395
396 # removes comment and gets name
397 $name = $1
398 if $line =~ s/[#%]\s*(.*)//;
399 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
400
401 # gets element
402 my($e, $k) = split /;/, $line;
403 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
404 if ! $k;
405
4d36a948
TS
406 @uv = _getHexArray($e);
407 return if !@uv;
bd65daab 408 return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
aa7758f7 409 exists $self->{suppressHash}{$uv[0]};
4d36a948 410 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 411
4d36a948 412 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 413 my $ele = pack_U(@uv);
a7fbee98 414
ed2081ad 415 # regarded as if it were not stored in the table
4d36a948
TS
416 return
417 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 418
4d36a948
TS
419 # replaced as completely ignorable
420 $k = '[.0000.0000.0000.0000]'
421 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 422 }
0116f5dc 423
4d36a948
TS
424 # replaced as completely ignorable
425 $k = '[.0000.0000.0000.0000]'
426 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
427
4c843366 428 my $is_L3_ignorable = TRUE;
4d36a948 429
caffd4cf
TS
430 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
431 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
TS
432 my @wt = _getHexArray($arr);
433 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 434 $is_L3_ignorable = FALSE
3756e7ca
RGS
435 if $wt[0] || $wt[1] || $wt[2];
436 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
437 # is completely ignorable.
4c843366
JH
438 # For expansion, an entry $is_L3_ignorable
439 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 440 }
caffd4cf 441
e7f779c8 442 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 443
91ae00cb 444 if (@uv > 1) {
b5d9a953
CBW
445 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
446 $self->{maxlength}{$uv[0]} = @uv;
447 }
448 }
f8187d97
SH
449 while (@uv > 2) {
450 pop @uv;
451 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
452 $self->{contraction}{$fake_entry} = 1;
91ae00cb 453 }
45394607
JH
454}
455
9f1f04a1 456
45394607
JH
457sub viewSortKey
458{
a7fbee98 459 my $self = shift;
750da838
CBW
460 my $str = shift;
461 $self->visualizeSortKey($self->getSortKey($str));
9f1f04a1 462}
0116f5dc 463
d16e9e3d 464
750da838
CBW
465sub process
466{
467 my $self = shift;
468 my $str = shift;
469 my $prep = $self->{preprocess};
470 my $norm = $self->{normCode};
471
472 $str = &$prep($str) if ref $prep;
473 $str = &$norm($str) if ref $norm;
474 return $str;
475}
476
45394607 477##
91ae00cb 478## arrayref of JCPS = splitEnt(string to be collated)
750da838 479## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
45394607 480##
91ae00cb 481sub splitEnt
45394607 482{
a7fbee98 483 my $self = shift;
750da838
CBW
484 my $str = shift;
485 my $wLen = shift; # with Length
4d36a948 486
91ae00cb 487 my $map = $self->{mapping};
a7fbee98
JH
488 my $max = $self->{maxlength};
489 my $reH = $self->{rearrangeHash};
b5d9a953
CBW
490 my $vers = $self->{UCA_Version};
491 my $ver9 = $vers >= 9 && $vers <= 11;
f8187d97 492 my $long = $self->{long_contraction};
19265284 493 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 494
750da838 495 my @buf;
a7fbee98 496
4d36a948 497 # get array of Unicode code point of string.
9f1f04a1 498 my @src = unpack_U($str);
4d36a948
TS
499
500 # rearrangement:
501 # Character positions are not kept if rearranged,
502 # then neglected if $wLen is true.
503 if ($reH && ! $wLen) {
a7fbee98
JH
504 for (my $i = 0; $i < @src; $i++) {
505 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
506 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
507 $i++;
508 }
509 }
45394607 510 }
45394607 511
3756e7ca 512 # remove a code point marked as a completely ignorable.
abd1ec54 513 for (my $i = 0; $i < @src; $i++) {
60f577e0 514 if ($vers <= 20 && _isIllegal($src[$i])) {
b5d9a953
CBW
515 $src[$i] = undef;
516 } elsif ($ver9) {
6d3c12b5
CBW
517 $src[$i] = undef if $map->{ $src[$i] }
518 ? @{ $map->{ $src[$i] } } == 0
f443a335 519 : $uXS && _ignorable_simple($src[$i]); ### XS only
b5d9a953 520 }
0116f5dc
JH
521 }
522
a7fbee98 523 for (my $i = 0; $i < @src; $i++) {
91ae00cb 524 my $jcps = $src[$i];
3756e7ca
RGS
525
526 # skip removed code point
527 if (! defined $jcps) {
528 if ($wLen && @buf) {
529 $buf[-1][2] = $i + 1;
530 }
531 next;
532 }
533
abd1ec54 534 my $i_orig = $i;
4d36a948 535
3756e7ca
RGS
536 # find contraction
537 if ($max->{$jcps}) {
91ae00cb
NC
538 my $temp_jcps = $jcps;
539 my $jcpsLen = 1;
540 my $maxLen = $max->{$jcps};
4d36a948 541
91ae00cb 542 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 543 next if ! defined $src[$p];
91ae00cb
NC
544 $temp_jcps .= CODE_SEP . $src[$p];
545 $jcpsLen++;
546 if ($map->{$temp_jcps}) {
547 $jcps = $temp_jcps;
4d36a948
TS
548 $i = $p;
549 }
550 }
4d36a948 551
b5d9a953 552 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
06c8fc8f 553 # This process requires Unicode::Normalize.
91ae00cb 554 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
555 # (in spite of bool value of $CVgetCombinClass),
556 # since canonical ordering cannot be expected.
557 # Blocked combining character should not be contracted.
558
06c8fc8f 559 # $self->{normCode} is false in the case of "prenormalized".
b5d9a953
CBW
560 if ($self->{normalization}) {
561 my $cont = $self->{contraction};
06c8fc8f 562 my $preCC = 0;
b5d9a953
CBW
563 my $preCC_uc = 0;
564 my $jcps_uc = $jcps;
565 my(@out, @out_uc);
06c8fc8f
RGS
566
567 for (my $p = $i + 1; $p < @src; $p++) {
568 next if ! defined $src[$p];
b5d9a953 569 my $curCC = $CVgetCombinClass->($src[$p]);
06c8fc8f
RGS
570 last unless $curCC;
571 my $tail = CODE_SEP . $src[$p];
b5d9a953 572
f8187d97
SH
573 if ($preCC != $curCC && $map->{$jcps.$tail}) {
574 $jcps .= $tail;
575 push @out, $p;
576 } else {
577 $preCC = $curCC;
578 }
579
580 next if !$long;
581
b5d9a953
CBW
582 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
583 $cont->{$jcps_uc.$tail})) {
584 $jcps_uc .= $tail;
585 push @out_uc, $p;
586 } else {
587 $preCC_uc = $curCC;
588 }
4d36a948 589 }
b5d9a953 590
f8187d97 591 if (@out_uc && $map->{$jcps_uc}) {
b5d9a953
CBW
592 $jcps = $jcps_uc;
593 $src[$_] = undef for @out_uc;
594 } else {
595 $src[$_] = undef for @out;
596 }
a7fbee98 597 }
a7fbee98
JH
598 }
599
3756e7ca 600 # skip completely ignorable
6d3c12b5
CBW
601 if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
602 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
3756e7ca
RGS
603 if ($wLen && @buf) {
604 $buf[-1][2] = $i + 1;
a7fbee98 605 }
3756e7ca 606 next;
a7fbee98 607 }
4d36a948 608
91ae00cb 609 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 610 }
4d36a948 611 return \@buf;
d16e9e3d 612}
45394607 613
539ce3d8
CBW
614##
615## VCE = _pack_override(input, codepoint, derivCode)
616##
617sub _pack_override ($$$) {
618 my $r = shift;
619 my $u = shift;
620 my $der = shift;
621
622 if (ref $r) {
623 return pack(VCE_TEMPLATE, NON_VAR, @$r);
624 } elsif (defined $r) {
625 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
626 } else {
d8e4b4ea 627 $u = 0xFFFD if 0x10FFFF < $u;
539ce3d8
CBW
628 return $der->($u);
629 }
630}
d16e9e3d
JH
631
632##
abd1ec54 633## list of VCE = getWt(JCPS)
d16e9e3d
JH
634##
635sub getWt
636{
a7fbee98 637 my $self = shift;
91ae00cb
NC
638 my $u = shift;
639 my $map = $self->{mapping};
0116f5dc 640 my $der = $self->{derivCode};
60f577e0 641 my $out = $self->{overrideOut};
19265284 642 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 643
91ae00cb 644 return if !defined $u;
6d3c12b5
CBW
645 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
646 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
60f577e0
SH
647 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
648
649 my @ce;
650 if ($map->{$u}) {
651 @ce = @{ $map->{$u} }; # $u may be a contraction
19265284 652### begin XS only ###
60f577e0
SH
653 } elsif ($uXS && _exists_simple($u)) {
654 @ce = _fetch_simple($u);
19265284 655### end XS only ###
60f577e0 656 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1 657 my $hang = $self->{overrideHangul};
1d2654e1 658 if ($hang) {
60f577e0 659 @ce = map _pack_override($_, $u, $der), $hang->($u);
539ce3d8 660 } elsif (!defined $hang) {
60f577e0 661 @ce = $der->($u);
539ce3d8 662 } else {
1d2654e1
JH
663 my $max = $self->{maxlength};
664 my @decH = _decompHangul($u);
665
666 if (@decH == 2) {
667 my $contract = join(CODE_SEP, @decH);
91ae00cb 668 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
669 } else { # must be <@decH == 3>
670 if ($max->{$decH[0]}) {
671 my $contract = join(CODE_SEP, @decH);
91ae00cb 672 if ($map->{$contract}) {
1d2654e1
JH
673 @decH = ($contract);
674 } else {
675 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 676 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
677 }
678 # even if V's ignorable, LT contraction is not supported.
f58b9ef1 679 # If such a situation were required, NFD should be used.
1d2654e1
JH
680 }
681 if (@decH == 3 && $max->{$decH[1]}) {
682 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 683 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
684 }
685 }
686
60f577e0 687 @ce = map({
f58b9ef1 688 $map->{$_} ? @{ $map->{$_} } :
19265284 689 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
f58b9ef1 690 $der->($_);
1d2654e1
JH
691 } @decH);
692 }
60f577e0
SH
693 } elsif ($out && 0x10FFFF < $u) {
694 @ce = map _pack_override($_, $u, $der), $out->($u);
539ce3d8 695 } else {
584e761d
CBW
696 my $cjk = $self->{overrideCJK};
697 my $vers = $self->{UCA_Version};
698 if ($cjk && _isUIdeo($u, $vers)) {
60f577e0
SH
699 @ce = map _pack_override($_, $u, $der), $cjk->($u);
700 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
701 @ce = _uideoCE_8($u);
702 } else {
703 @ce = $der->($u);
584e761d 704 }
a7fbee98 705 }
60f577e0 706 return map $self->varCE($_), @ce;
d16e9e3d
JH
707}
708
d16e9e3d
JH
709
710##
711## string sortkey = getSortKey(string arg)
712##
713sub getSortKey
714{
a7fbee98 715 my $self = shift;
750da838
CBW
716 my $orig = shift;
717 my $str = $self->process($orig);
718 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
00e00351 719 my $vers = $self->{UCA_Version};
aa7758f7 720 my $term = $self->{hangul_terminator};
750da838
CBW
721 my $lev = $self->{level};
722 my $iden = $self->{identical};
91ae00cb 723
abd1ec54 724 my @buf; # weight arrays
aa7758f7 725 if ($term) {
91ae00cb 726 my $preHST = '';
19265284 727 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
91ae00cb
NC
728 foreach my $jcps (@$rEnt) {
729 # weird things like VL, TL-contraction are not considered!
aa7758f7 730 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
91ae00cb
NC
731 if ($preHST && !$curHST || # hangul before non-hangul
732 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
733 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
734 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
aa7758f7 735 push @buf, $termCE;
91ae00cb
NC
736 }
737 $preHST = $curHST;
abd1ec54 738 push @buf, $self->getWt($jcps);
91ae00cb 739 }
aa7758f7
CBW
740 push @buf, $termCE if $preHST; # end at hangul
741 } else {
91ae00cb 742 foreach my $jcps (@$rEnt) {
abd1ec54 743 push @buf, $self->getWt($jcps);
91ae00cb
NC
744 }
745 }
746
750da838
CBW
747 my $rkey = $self->mk_SortKey(\@buf); ### XS only
748
749 if ($iden || $vers >= 26 && $lev == MaxLevel) {
750 $rkey .= LEVEL_SEP;
751 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
752 }
753 return $rkey;
45394607
JH
754}
755
756
757##
d16e9e3d 758## int compare = cmp(string a, string b)
45394607 759##
5398038e
TS
760sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
761sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
762sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
763sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
764sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
765sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
766sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
767
768##
d16e9e3d 769## list[strings] sorted = sort(list[strings] arg)
45394607 770##
a7fbee98
JH
771sub sort {
772 my $obj = shift;
773 return
774 map { $_->[1] }
775 sort{ $a->[0] cmp $b->[0] }
776 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
777}
778
0116f5dc 779
4d36a948
TS
780##
781## bool _nonIgnorAtLevel(arrayref weights, int level)
782##
783sub _nonIgnorAtLevel($$)
784{
785 my $wt = shift;
786 return if ! defined $wt;
787 my $lv = shift;
9f1f04a1 788 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
TS
789}
790
791##
792## bool _eqArray(
793## arrayref of arrayref[weights] source,
794## arrayref of arrayref[weights] substr,
795## int level)
796## * comparison of graphemes vs graphemes.
797## @$source >= @$substr must be true (check it before call this);
798##
799sub _eqArray($$$)
800{
801 my $source = shift;
802 my $substr = shift;
803 my $lev = shift;
804
805 for my $g (0..@$substr-1){
f58b9ef1 806 # Do the $g'th graphemes have the same number of AV weights?
4d36a948
TS
807 return if @{ $source->[$g] } != @{ $substr->[$g] };
808
809 for my $w (0..@{ $substr->[$g] }-1) {
810 for my $v (0..$lev-1) {
811 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
812 }
813 }
814 }
815 return 1;
816}
817
818##
819## (int position, int length)
17609435 820## int position = index(string, substring, position, [undoc'ed global])
4d36a948 821##
17609435 822## With "global" (only for the list context),
4d36a948
TS
823## returns list of arrayref[position, length].
824##
825sub index
826{
91ae00cb 827 my $self = shift;
750da838
CBW
828 $self->{preprocess} and
829 croak "Don't use Preprocess with index(), match(), etc.";
830 $self->{normCode} and
831 croak "Don't use Normalization with index(), match(), etc.";
832
91ae00cb
NC
833 my $str = shift;
834 my $len = length($str);
750da838
CBW
835 my $sub = shift;
836 my $subE = $self->splitEnt($sub);
91ae00cb
NC
837 my $pos = @_ ? shift : 0;
838 $pos = 0 if $pos < 0;
17609435 839 my $glob = shift;
91ae00cb
NC
840
841 my $lev = $self->{level};
3756e7ca
RGS
842 my $v2i = $self->{UCA_Version} >= 9 &&
843 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
844
845 if (! @$subE) {
4d36a948 846 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
17609435 847 return $glob
4d36a948
TS
848 ? map([$_, 0], $temp..$len)
849 : wantarray ? ($temp,0) : $temp;
850 }
abd1ec54
NC
851 $len < $pos
852 and return wantarray ? () : NOMATCHPOS;
91ae00cb 853 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
854 @$strE
855 or return wantarray ? () : NOMATCHPOS;
856
4d36a948
TS
857 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
858
abd1ec54
NC
859 my $last_is_variable;
860 for my $vwt (map $self->getWt($_), @$subE) {
861 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
862 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 863
3756e7ca
RGS
864 # "Ignorable (L1, L2) after Variable" since track. v. 9
865 if ($v2i) {
abd1ec54
NC
866 if ($var) {
867 $last_is_variable = TRUE;
868 }
869 elsif (!$wt[0]) { # ignorable
4d36a948 870 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
871 }
872 else {
873 $last_is_variable = FALSE;
4d36a948
TS
874 }
875 }
876
abd1ec54
NC
877 if (@subWt && !$var && !$wt[0]) {
878 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1393fe00 879 } elsif ($to_be_pushed) {
abd1ec54 880 push @subWt, [ \@wt ];
4d36a948 881 }
1393fe00 882 # else ===> skipped
4d36a948
TS
883 }
884
885 my $count = 0;
91ae00cb 886 my $end = @$strE - 1;
4d36a948 887
abd1ec54 888 $last_is_variable = FALSE; # reuse
4d36a948
TS
889 for (my $i = 0; $i <= $end; ) { # no $i++
890 my $found_base = 0;
891
892 # fetch a grapheme
893 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
894 for my $vwt ($self->getWt($strE->[$i][0])) {
895 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
896 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 897
3756e7ca
RGS
898 # "Ignorable (L1, L2) after Variable" since track. v. 9
899 if ($v2i) {
abd1ec54
NC
900 if ($var) {
901 $last_is_variable = TRUE;
902 }
903 elsif (!$wt[0]) { # ignorable
4d36a948 904 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
905 }
906 else {
907 $last_is_variable = FALSE;
4d36a948
TS
908 }
909 }
910
abd1ec54
NC
911 if (@strWt && !$var && !$wt[0]) {
912 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 913 $finPos[-1] = $strE->[$i][2];
4d36a948 914 } elsif ($to_be_pushed) {
abd1ec54 915 push @strWt, [ \@wt ];
91ae00cb 916 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 917 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 918 push @finPos, $strE->[$i][2];
4d36a948
TS
919 $found_base++;
920 }
921 # else ===> no-op
922 }
923 $i++;
924 }
925
926 # try to match
927 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
928 if ($iniPos[0] != NOMATCHPOS &&
929 $finPos[$#subWt] != NOMATCHPOS &&
930 _eqArray(\@strWt, \@subWt, $lev)) {
931 my $temp = $iniPos[0] + $pos;
932
17609435 933 if ($glob) {
4d36a948
TS
934 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
935 splice @strWt, 0, $#subWt;
936 splice @iniPos, 0, $#subWt;
937 splice @finPos, 0, $#subWt;
938 }
939 else {
940 return wantarray
941 ? ($temp, $finPos[$#subWt] - $iniPos[0])
942 : $temp;
943 }
944 }
945 shift @strWt;
946 shift @iniPos;
947 shift @finPos;
948 }
949 }
950
17609435 951 return $glob
4d36a948
TS
952 ? @g_ret
953 : wantarray ? () : NOMATCHPOS;
954}
955
956##
957## scalarref to matching part = match(string, substring)
958##
959sub match
960{
961 my $self = shift;
962 if (my($pos,$len) = $self->index($_[0], $_[1])) {
963 my $temp = substr($_[0], $pos, $len);
964 return wantarray ? $temp : \$temp;
965 # An lvalue ref \substr should be avoided,
966 # since its value is affected by modification of its referent.
967 }
968 else {
969 return;
970 }
971}
972
973##
974## arrayref matching parts = gmatch(string, substring)
975##
976sub gmatch
977{
978 my $self = shift;
979 my $str = shift;
980 my $sub = shift;
981 return map substr($str, $_->[0], $_->[1]),
982 $self->index($str, $sub, 0, 'g');
983}
984
985##
986## bool subst'ed = subst(string, substring, replace)
987##
988sub subst
989{
990 my $self = shift;
991 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
992
993 if (my($pos,$len) = $self->index($_[0], $_[1])) {
994 if ($code) {
995 my $mat = substr($_[0], $pos, $len);
996 substr($_[0], $pos, $len, $code->($mat));
997 } else {
998 substr($_[0], $pos, $len, $_[2]);
999 }
1000 return TRUE;
1001 }
1002 else {
1003 return FALSE;
1004 }
1005}
1006
1007##
1008## int count = gsubst(string, substring, replace)
1009##
1010sub gsubst
1011{
1012 my $self = shift;
1013 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1014 my $cnt = 0;
1015
1016 # Replacement is carried out from the end, then use reverse.
1017 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1018 if ($code) {
1019 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1020 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1021 } else {
1022 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1023 }
1024 $cnt++;
1025 }
1026 return $cnt;
1027}
1028
45394607
JH
10291;
1030__END__
1031
1032=head1 NAME
1033
a7fbee98 1034Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1035
1036=head1 SYNOPSIS
1037
1038 use Unicode::Collate;
1039
1040 #construct
5398038e 1041 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1042
1043 #sort
5398038e 1044 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1045
1046 #compare
a7fbee98 1047 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1048
539ce3d8
CBW
1049B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1050according to Perl's Unicode support. See L<perlunicode>,
1051L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1052Otherwise you can use C<preprocess> or should decode them before.
91ae00cb 1053
45394607
JH
1054=head1 DESCRIPTION
1055
3756e7ca
RGS
1056This module is an implementation of Unicode Technical Standard #10
1057(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1058
45394607
JH
1059=head2 Constructor and Tailoring
1060
539ce3d8
CBW
1061The C<new> method returns a collator object. If new() is called
1062with no parameters, the collator should do the default collation.
d16e9e3d 1063
5398038e 1064 $Collator = Unicode::Collate->new(
0116f5dc 1065 UCA_Version => $UCA_Version,
aa7758f7 1066 alternate => $alternate, # alias for 'variable'
45394607
JH
1067 backwards => $levelNumber, # or \@levelNumbers
1068 entry => $element,
91ae00cb 1069 hangul_terminator => $term_primary_weight,
6d3c12b5 1070 highestFFFF => $bool,
750da838 1071 identical => $bool,
45394607
JH
1072 ignoreName => qr/$ignoreName/,
1073 ignoreChar => qr/$ignoreChar/,
19265284 1074 ignore_level2 => $bool,
45394607
JH
1075 katakana_before_hiragana => $bool,
1076 level => $collationLevel,
f8187d97 1077 long_contraction => $bool,
6d3c12b5 1078 minimalFFFE => $bool,
91ae00cb 1079 normalization => $normalization_form,
45394607
JH
1080 overrideCJK => \&overrideCJK,
1081 overrideHangul => \&overrideHangul,
1082 preprocess => \&preprocess,
1083 rearrange => \@charList,
19265284 1084 rewrite => \&rewrite,
aa7758f7 1085 suppress => \@charList,
45394607
JH
1086 table => $filename,
1087 undefName => qr/$undefName/,
1088 undefChar => qr/$undefChar/,
1089 upper_before_lower => $bool,
91ae00cb 1090 variable => $variable,
45394607 1091 );
45394607
JH
1092
1093=over 4
1094
0116f5dc
JH
1095=item UCA_Version
1096
7b98b857
CBW
1097If the revision (previously "tracking version") number of UCA is given,
1098behavior of that revision is emulated on collating.
0116f5dc 1099If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1100
2b488730 1101The following revisions are supported. The default is 28.
3756e7ca 1102
0d50d293 1103 UCA Unicode Standard DUCET (@version)
b5d9a953 1104 -------------------------------------------------------
0d50d293
RGS
1105 8 3.1 3.0.1 (3.0.1d9)
1106 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1107 11 4.0 4.0.0 (4.0.0)
1108 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1109 16 5.0 5.0.0 (5.0.0)
1110 18 5.1.0 5.1.0 (5.1.0)
00e00351 1111 20 5.2.0 5.2.0 (5.2.0)
b5d9a953 1112 22 6.0.0 6.0.0 (6.0.0)
cba8842c 1113 24 6.1.0 6.1.0 (6.1.0)
750da838 1114 26 6.2.0 6.2.0 (6.2.0)
2b488730 1115 28 6.3.0 6.3.0 (6.3.0)
3756e7ca 1116
f8187d97
SH
1117* See below C<long_contraction> with C<UCA_Version> 22 and 24.
1118
f58b9ef1 1119* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
b5d9a953
CBW
1120since C<UCA_Version> 22.
1121
60f577e0
SH
1122* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1123and can be overridden since C<UCA_Version> 22.
1124
b5d9a953
CBW
1125* Fully ignorable characters were ignored, and would not interrupt
1126contractions with C<UCA_Version> 9 and 11.
1127
1128* Treatment of ignorables after variables and some behaviors
1129were changed at C<UCA_Version> 9.
1130
1131* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1132depend on C<UCA_Version>.
1133
1134* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1135C<hangul_terminator>.
1136
3756e7ca 1137=item alternate
0116f5dc 1138
3756e7ca 1139-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1140
3756e7ca
RGS
1141For backward compatibility, C<alternate> (old name) can be used
1142as an alias for C<variable>.
0116f5dc 1143
45394607
JH
1144=item backwards
1145
750da838 1146-- see 3.4 Backward Accents, UTS #10.
45394607
JH
1147
1148 backwards => $levelNumber or \@levelNumbers
1149
1150Weights in reverse order; ex. level 2 (diacritic ordering) in French.
68adb2b0
CBW
1151If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1152forwards at all the levels.
45394607
JH
1153
1154=item entry
1155
f8187d97 1156-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
45394607 1157
91ae00cb
NC
1158If the same character (or a sequence of characters) exists
1159in the collation element table through C<table>,
f58b9ef1 1160mapping to collation elements is overridden.
91ae00cb 1161If it does not exist, the mapping is defined additionally.
45394607 1162
abd1ec54
NC
1163 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11640063 0068 ; [.0E6A.0020.0002.0063] # ch
11650043 0068 ; [.0E6A.0020.0007.0043] # Ch
11660043 0048 ; [.0E6A.0020.0008.0043] # CH
1167006C 006C ; [.0F4C.0020.0002.006C] # ll
1168004C 006C ; [.0F4C.0020.0007.004C] # Ll
1169004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
117000F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1171006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
117200D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1173004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1174ENTRY
1175
1176 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
117700E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
117800C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1179ENTRY
45394607 1180
4d36a948 1181B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1182B<must> be a Unicode code point (defined as hexadecimal),
1183but not a native code point.
4d36a948
TS
1184So C<0063> must always denote C<U+0063>,
1185but not a character of C<"\x63">.
1186
abd1ec54
NC
1187Weighting may vary depending on collation element table.
1188So ensure the weights defined in C<entry> will be consistent with
1189those in the collation element table loaded via C<table>.
1190
1191In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1192and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1193(as a value between C<0E60> and C<0E6D>)
1194makes ordering as C<C E<lt> CH E<lt> D>.
1195Exactly speaking DUCET already has some characters between C<C> and C<D>:
1196C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1197C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1198and C<c-curl> (C<U+0255>) with C<0E69>.
1199Then primary weight C<0E6A> for C<CH> makes C<CH>
1200ordered between C<c-curl> and C<D>.
1201
91ae00cb
NC
1202=item hangul_terminator
1203
3756e7ca 1204-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1205
1206If a true value is given (non-zero but should be positive),
1207it will be added as a terminator primary weight to the end of
1208every standard Hangul syllable. Secondary and any higher weights
1209for terminator are set to zero.
1210If the value is false or C<hangul_terminator> key does not exist,
1211insertion of terminator weights will not be performed.
1212
1213Boundaries of Hangul syllables are determined
1214according to conjoining Jamo behavior in F<the Unicode Standard>
1215and F<HangulSyllableType.txt>.
1216
1217B<Implementation Note:>
1218(1) For expansion mapping (Unicode character mapped
1219to a sequence of collation elements), a terminator will not be added
1220between collation elements, even if Hangul syllable boundary exists there.
1221Addition of terminator is restricted to the next position
1222to the last collation element.
1223
1224(2) Non-conjoining Hangul letters
1225(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1226automatically terminated with a terminator primary weight.
1227These characters may need terminator included in a collation element
1228table beforehand.
1229
6d3c12b5
CBW
1230=item highestFFFF
1231
1232-- see 5.14 Collation Elements, UTS #35.
1233
1234If the parameter is made true, C<U+FFFF> has a highest primary weight.
1235When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1236C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1237begins with C<"abc">, or another primary equivalent.
1238C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1239such as C<"abc\x{FFFF}xyz">.
1240
1241C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
ed2081ad 1242almost, but the latter has a problem that you should know which letter is
6d3c12b5 1243next to C<c>. For a certain language where C<ch> as the next letter,
ed2081ad 1244C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
6d3c12b5 1245
60f577e0
SH
1246Note:
1247This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
f443a335 1248Any other character than C<U+FFFF> can be tailored by C<entry>.
6d3c12b5 1249
750da838
CBW
1250=item identical
1251
1252-- see A.3 Deterministic Comparison, UTS #10.
1253
1254By default, strings whose weights are equal should be equal,
1255even though their code points are not equal.
5c77070f 1256Completely ignorable characters are ignored.
750da838
CBW
1257
1258If the parameter is made true, a final, tie-breaking level is used.
5c77070f
CBW
1259If no difference of weights is found after the comparison through
1260all the level specified by C<level>, the comparison with code points
1261will be performed.
ed2081ad 1262For the tie-breaking comparison, the sort key has code points
5c77070f
CBW
1263of the original string appended.
1264Completely ignorable characters are not ignored.
750da838
CBW
1265
1266If C<preprocess> and/or C<normalization> is applied, the code points
1267of the string after them (in NFD by default) are used.
1268
45394607
JH
1269=item ignoreChar
1270
3756e7ca
RGS
1271=item ignoreName
1272
f8187d97 1273-- see 3.6 Variable Weighting, UTS #10.
45394607 1274
caffd4cf
TS
1275Makes the entry in the table completely ignorable;
1276i.e. as if the weights were zero at all level.
45394607 1277
3756e7ca
RGS
1278Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1279will be ignored. Through C<ignoreName>, any character whose name
1280(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1281will be ignored.
1282
a7fbee98 1283E.g. when 'a' and 'e' are ignorable,
45394607
JH
1284'element' is equal to 'lament' (or 'lmnt').
1285
19265284
CBW
1286=item ignore_level2
1287
1288-- see 5.1 Parametric Tailoring, UTS #10.
1289
1290By default, case-sensitive comparison (that is level 3 difference)
1291won't ignore accents (that is level 2 difference).
1292
1293If the parameter is made true, accents (and other primary ignorable
1294characters) are ignored, even though cases are taken into account.
1295
1296B<NOTE>: C<level> should be 3 or greater.
1297
3756e7ca
RGS
1298=item katakana_before_hiragana
1299
750da838 1300-- see 7.2 Tertiary Weight Table, UTS #10.
3756e7ca
RGS
1301
1302By default, hiragana is before katakana.
1303If the parameter is made true, this is reversed.
1304
1305B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1306distinctions must occur in level 3, and their weights at level 3 must be
1307same as those mentioned in 7.3.1, UTS #10.
1308If you define your collation elements which violate this requirement,
1309this parameter does not work validly.
1310
45394607
JH
1311=item level
1312
3756e7ca 1313-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1314
1315Set the maximum level.
1316Any higher levels than the specified one are ignored.
1317
1318 Level 1: alphabetic ordering
1319 Level 2: diacritic ordering
1320 Level 3: case ordering
91ae00cb 1321 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1322
1323 ex.level => 2,
1324
a7fbee98
JH
1325If omitted, the maximum is the 4th.
1326
750da838
CBW
1327B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1328But this module only uses weights within 0xFFFF.
1329When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1330and 'shift-trimmed'), the level 4 may be unreliable.
1331
1332See also C<identical>.
1333
f8187d97
SH
1334=item long_contraction
1335
1336-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1337
1338If the parameter is made true, for a contraction with three or more
1339characters (here nicknamed "long contraction"), initial substrings
1340will be handled.
1341For example, a contraction ABC, where A is a starter, and B and C
1342are non-starters (character with non-zero combining character class),
1343will be detected even if there is not AB as a contraction.
1344
1345B<Default:> Usually false.
1346If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1347is not specified in C<new()>, a true value is set implicitly.
1348This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1349
1350C<change()> handles C<long_contraction> explicitly only.
1351If C<long_contraction> is not specified in C<change()>, even though
1352C<UCA_Version> is changed, C<long_contraction> will not be changed.
1353
1354B<Limitation:> Scanning non-starters is one-way (no back tracking).
1355If AB is found but not ABC is not found, other long contraction where
1356the first character is A and the second is not B may not be found.
1357
1358Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1359contractions are skipped.
1360
1361B<Note:> The following contractions in DUCET are not considered
1362in steps S2.1.1 to S2.1.3, where they are discontiguous.
1363
1364 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1365 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1366
1367For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1368(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1369In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1370instead of C<0FB2 0F71 0F80>.
1371Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1372contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1373
6d3c12b5
CBW
1374=item minimalFFFE
1375
1376-- see 5.14 Collation Elements, UTS #35.
1377
1378If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1379The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1380first compares C<$a1> and C<$b1> at level 1, and
1381then C<$a2> and C<$b2> at level 1, as followed.
1382
1383 "ab\x{FFFE}a"
1384 "Ab\x{FFFE}a"
1385 "ab\x{FFFE}c"
1386 "Ab\x{FFFE}c"
1387 "ab\x{FFFE}xyz"
1388 "abc\x{FFFE}def"
1389 "abc\x{FFFE}xYz"
1390 "aBc\x{FFFE}xyz"
1391 "abcX\x{FFFE}def"
1392 "abcx\x{FFFE}xyz"
1393 "b\x{FFFE}aaa"
1394 "bbb\x{FFFE}a"
1395
60f577e0
SH
1396Note:
1397This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
f443a335 1398Any other character than C<U+FFFE> can be tailored by C<entry>.
6d3c12b5 1399
45394607
JH
1400=item normalization
1401
3756e7ca 1402-- see 4.1 Normalize, UTS #10.
45394607 1403
905aa9f0 1404If specified, strings are normalized before preparation of sort keys
45394607
JH
1405(the normalization is executed after preprocess).
1406
1d2654e1
JH
1407A form name C<Unicode::Normalize::normalize()> accepts will be applied
1408as C<$normalization_form>.
06c8fc8f 1409Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1410See C<Unicode::Normalize::normalize()> for detail.
1411If omitted, C<'NFD'> is used.
45394607 1412
91ae00cb 1413C<normalization> is performed after C<preprocess> (if defined).
45394607 1414
06c8fc8f
RGS
1415Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1416though they are not concerned with C<Unicode::Normalize::normalize()>.
1417
1418If C<undef> (not a string C<"undef">) is passed explicitly
1419as the value for this key,
45394607 1420any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1421if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1422only contiguous contractions are resolved;
1423e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1424C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1425In this point,
1426C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1427B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1428
1429In the case of C<(normalization =E<gt> "prenormalized")>,
1430any normalization is not performed, but
b5d9a953 1431discontiguous contractions with combining characters are performed.
06c8fc8f
RGS
1432Therefore
1433C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1434B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1435If source strings are finely prenormalized,
1436C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1437
1438Except C<(normalization =E<gt> undef)>,
1439B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1440
1441=item overrideCJK
1442
4d36a948 1443-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1444
584e761d 1445By default, CJK unified ideographs are ordered in Unicode codepoint
ed2081ad 1446order, but those in the CJK Unified Ideographs block are less than
584e761d 1447those in the CJK Unified Ideographs Extension A etc.
00e00351 1448
b5d9a953 1449 In the CJK Unified Ideographs block:
cba8842c
A
1450 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1451 U+4E00..U+9FBB if UCA_Version is 14 or 16.
b5d9a953 1452 U+4E00..U+9FC3 if UCA_Version is 18.
cba8842c 1453 U+4E00..U+9FCB if UCA_Version is 20 or 22.
2b488730 1454 U+4E00..U+9FCC if UCA_Version is 24 or later.
00e00351 1455
b5d9a953
CBW
1456 In the CJK Unified Ideographs Extension blocks:
1457 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
2b488730
CBW
1458 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1459 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
91ae00cb 1460
584e761d 1461Through C<overrideCJK>, ordering of CJK unified ideographs (including
f58b9ef1 1462extensions) can be overridden.
45394607 1463
584e761d 1464ex. CJK unified ideographs in the JIS code point order.
45394607
JH
1465
1466 overrideCJK => sub {
a7fbee98
JH
1467 my $u = shift; # get a Unicode codepoint
1468 my $b = pack('n', $u); # to UTF-16BE
1469 my $s = your_unicode_to_sjis_converter($b); # convert
1470 my $n = unpack('n', $s); # convert sjis to short
1471 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1472 },
1473
539ce3d8
CBW
1474The return value may be an arrayref of 1st to 4th weights as shown
1475above. The return value may be an integer as the primary weight
1476as shown below. If C<undef> is returned, the default derived
1477collation element will be used.
1478
1479 overrideCJK => sub {
1480 my $u = shift; # get a Unicode codepoint
1481 my $b = pack('n', $u); # to UTF-16BE
1482 my $s = your_unicode_to_sjis_converter($b); # convert
1483 my $n = unpack('n', $s); # convert sjis to short
1484 return $n; # return the primary weight
1485 },
1486
1487The return value may be a list containing zero or more of
1488an arrayref, an integer, or C<undef>.
1489
584e761d 1490ex. ignores all CJK unified ideographs.
a7fbee98
JH
1491
1492 overrideCJK => sub {()}, # CODEREF returning empty list
1493
1494 # where ->eq("Pe\x{4E00}rl", "Perl") is true
584e761d 1495 # as U+4E00 is a CJK unified ideograph and to be ignorable.
a7fbee98 1496
60f577e0
SH
1497If a false value (including C<undef>) is passed, C<overrideCJK>
1498has no effect.
1499C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1500
584e761d
CBW
1501But assignment of weight for CJK unified ideographs
1502in C<table> or C<entry> is still valid.
60f577e0
SH
1503If C<undef> is passed explicitly as the value for this key,
1504weights for CJK unified ideographs are treated as undefined.
1505However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1506has no special meaning.
584e761d
CBW
1507
1508B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1509C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1510C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
f58b9ef1 1511ideographs. But they can't be overridden via C<overrideCJK> when you use
584e761d
CBW
1512DUCET, as the table includes weights for them. C<table> or C<entry> has
1513priority over C<overrideCJK>.
a7fbee98
JH
1514
1515=item overrideHangul
1516
4d36a948 1517-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1518
584e761d 1519By default, Hangul syllables are decomposed into Hangul Jamo,
abd1ec54 1520even if C<(normalization =E<gt> undef)>.
f58b9ef1 1521But the mapping of Hangul syllables may be overridden.
a7fbee98 1522
3756e7ca 1523This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1524
584e761d
CBW
1525If you want to override the mapping of Hangul syllables,
1526NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1527Hangul syllables before overriding. FCD may decompose Hangul syllables
1528as the case may be.
45394607 1529
60f577e0
SH
1530If a false value (but not C<undef>) is passed, C<overrideHangul>
1531has no effect.
1532C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1533
a7fbee98 1534If C<undef> is passed explicitly as the value for this key,
584e761d 1535weight for Hangul syllables is treated as undefined
a7fbee98 1536without decomposition into Hangul Jamo.
584e761d
CBW
1537But definition of weight for Hangul syllables
1538in C<table> or C<entry> is still valid.
a7fbee98 1539
60f577e0
SH
1540=item overrideOut
1541
1542-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1543
1544Perl seems to allow out-of-range values (greater than 0x10FFFF).
1545By default, out-of-range values are replaced with C<U+FFFD>
1546(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1547or ignored when C<UCA_Version> E<lt>= 20.
1548
1549When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1550can be overridden. Though C<table> or C<entry> are available for them,
1551out-of-range values are too many.
1552
1553C<overrideOut> can perform it algorithmically.
1554This parameter works like C<overrideCJK>, so see there for examples.
1555
1556ex. ignores all out-of-range values.
1557
1558 overrideOut => sub {()}, # CODEREF returning empty list
1559
1560If a false value (including C<undef>) is passed, C<overrideOut>
1561has no effect.
1562C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1563
d8e4b4ea
CBW
1564B<NOTE ABOUT U+FFFD:>
1565
60f577e0
SH
1566UCA recommends that out-of-range values should not be ignored for security
1567reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1568However, C<U+FFFD> is wrongly mapped to a variable collation element
1569in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1570ignored when C<variable> isn't C<Non-ignorable>.
1571
d8e4b4ea
CBW
1572The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1573see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1574(7.1.4 Trailing Weights). Such a correction is reproduced by this.
60f577e0
SH
1575
1576 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1577
d8e4b4ea 1578This workaround is unnecessary since Unicode 6.3.0.
60f577e0 1579
45394607
JH
1580=item preprocess
1581
750da838 1582-- see 5.4 Preprocessing, UTS #10.
45394607 1583
19265284 1584If specified, the coderef is used to preprocess each string
45394607
JH
1585before the formation of sort keys.
1586
a7fbee98 1587ex. dropping English articles, such as "a" or "the".
45394607
JH
1588Then, "the pen" is before "a pencil".
1589
1590 preprocess => sub {
1591 my $str = shift;
a7fbee98 1592 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1593 return $str;
45394607
JH
1594 },
1595
91ae00cb 1596C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1597
539ce3d8
CBW
1598ex. decoding strings in a legacy encoding such as shift-jis:
1599
1600 $sjis_collator = Unicode::Collate->new(
1601 preprocess => \&your_shiftjis_to_unicode_decoder,
1602 );
1603 @result = $sjis_collator->sort(@shiftjis_strings);
1604
1605B<Note:> Strings returned from the coderef will be interpreted
1606according to Perl's Unicode support. See L<perlunicode>,
1607L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1608
45394607
JH
1609=item rearrange
1610
750da838 1611-- see 3.5 Rearrangement, UTS #10.
45394607
JH
1612
1613Characters that are not coded in logical order and to be rearranged.
ed2081ad 1614If C<UCA_Version> is equal to or less than 11, default is:
45394607
JH
1615
1616 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1617
3756e7ca
RGS
1618If you want to disallow any rearrangement, pass C<undef> or C<[]>
1619(a reference to empty list) as the value for this key.
1620
74b94a79
CBW
1621If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1622(i.e. no rearrangement).
a7fbee98 1623
0116f5dc
JH
1624B<According to the version 9 of UCA, this parameter shall not be used;
1625but it is not warned at present.>
1626
19265284
CBW
1627=item rewrite
1628
1629If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1630The coderef will get each line, and then should return a rewritten line
1631according to the UCA file format.
1632If the coderef returns an empty line, the line will be skipped.
1633
1634e.g. any primary ignorable characters into tertiary ignorable:
1635
1636 rewrite => sub {
1637 my $line = shift;
1638 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1639 return $line;
1640 },
1641
1642This example shows rewriting weights. C<rewrite> is allowed to
1643affect code points, weights, and the name.
1644
1645B<NOTE>: C<table> is available to use another table file;
1646preparing a modified table once would be more efficient than
1647rewriting lines on reading an unmodified table every time.
1648
aa7758f7
CBW
1649=item suppress
1650
1651-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1652UTS #35 (LDML).
1653
539ce3d8 1654Contractions beginning with the specified characters are suppressed,
bd65daab 1655even if those contractions are defined in C<table>.
aa7758f7
CBW
1656
1657An example for Russian and some languages using the Cyrillic script:
1658
1659 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1660
1661where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1662
bd65daab
CBW
1663B<NOTE>: Contractions via C<entry> are not be suppressed.
1664
45394607
JH
1665=item table
1666
f8187d97 1667-- see 3.8 Default Unicode Collation Element Table, UTS #10.
45394607 1668
91ae00cb 1669You can use another collation element table if desired.
45394607 1670
e7f779c8 1671The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1672on C<@INC>. Say, if the filename is F<Foo.txt>,
1673the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1674
1675By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1676If you will prepare your own table file, any name other than F<allkeys.txt>
1677may be better to avoid namespace conflict.
45394607 1678
00e00351
CBW
1679B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1680module, and it may save time at the run time.
60f577e0 1681Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
19265284
CBW
1682or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1683C<rewrite> will prevent this module from using the compiled DUCET.
00e00351 1684
a7fbee98 1685If C<undef> is passed explicitly as the value for this key,
91ae00cb 1686no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1687
1688A typical way to define a collation element table
1689without any file of table:
1690
1691 $onlyABC = Unicode::Collate->new(
1692 table => undef,
1693 entry => << 'ENTRIES',
16940061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
16950041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
16960062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
16970042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
16980063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
16990043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1700ENTRIES
1701 );
905aa9f0 1702
3756e7ca
RGS
1703If C<ignoreName> or C<undefName> is used, character names should be
1704specified as a comment (following C<#>) on each line.
45394607
JH
1705
1706=item undefChar
1707
3756e7ca
RGS
1708=item undefName
1709
4d36a948 1710-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1711
584e761d 1712Undefines the collation element as if it were unassigned in the C<table>.
45394607
JH
1713This reduces the size of the table.
1714If an unassigned character appears in the string to be collated,
1715the sort key is made from its codepoint
1716as a single-character collation element,
1717as it is greater than any other assigned collation elements
1718(in the codepoint order among the unassigned characters).
1719But, it'd be better to ignore characters
1720unfamiliar to you and maybe never used.
1721
3756e7ca
RGS
1722Through C<undefChar>, any character matching C<qr/$undefChar/>
1723will be undefined. Through C<undefName>, any character whose name
1724(given in the C<table> file as a comment) matches C<qr/$undefName/>
1725will be undefined.
1726
e7f779c8
RGS
1727ex. Collation weights for beyond-BMP characters are not stored in object:
1728
1729 undefChar => qr/[^\0-\x{fffd}]/,
1730
45394607
JH
1731=item upper_before_lower
1732
3756e7ca 1733-- see 6.6 Case Comparisons, UTS #10.
45394607 1734
3756e7ca
RGS
1735By default, lowercase is before uppercase.
1736If the parameter is made true, this is reversed.
45394607 1737
3756e7ca
RGS
1738B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1739distinctions must occur in level 3, and their weights at level 3 must be
1740same as those mentioned in 7.3.1, UTS #10.
1741If you define your collation elements which differs from this requirement,
1742this parameter doesn't work validly.
45394607 1743
91ae00cb
NC
1744=item variable
1745
f8187d97 1746-- see 3.6 Variable Weighting, UTS #10.
91ae00cb 1747
bd65daab 1748This key allows for variable weighting of variable collation elements,
91ae00cb 1749which are marked with an ASTERISK in the table
f58b9ef1 1750(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
91ae00cb
NC
1751
1752 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1753
1754These names are case-insensitive.
1755By default (if specification is omitted), 'shifted' is adopted.
1756
1757 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1758 considered at the 4th level.
1759
abd1ec54 1760 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1761
1762 'Shifted' Variable elements are made ignorable at levels 1 through 3
1763 their level 4 weight is replaced by the old level 1 weight.
1764 Level 4 weight for Non-Variable elements is 0xFFFF.
1765
1766 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1767 are trimmed.
1768
45394607
JH
1769=back
1770
3164dd77 1771=head2 Methods for Collation
45394607
JH
1772
1773=over 4
1774
5398038e 1775=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1776
1777Sorts a list of strings.
1778
5398038e 1779=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1780
1781Returns 1 (when C<$a> is greater than C<$b>)
1782or 0 (when C<$a> is equal to C<$b>)
ed2081ad 1783or -1 (when C<$a> is less than C<$b>).
45394607 1784
5398038e
TS
1785=item C<$result = $Collator-E<gt>eq($a, $b)>
1786
1787=item C<$result = $Collator-E<gt>ne($a, $b)>
1788
1789=item C<$result = $Collator-E<gt>lt($a, $b)>
1790
1791=item C<$result = $Collator-E<gt>le($a, $b)>
1792
1793=item C<$result = $Collator-E<gt>gt($a, $b)>
1794
1795=item C<$result = $Collator-E<gt>ge($a, $b)>
1796
a7fbee98 1797They works like the same name operators as theirs.
5398038e
TS
1798
1799 eq : whether $a is equal to $b.
1800 ne : whether $a is not equal to $b.
ed2081ad
CBW
1801 lt : whether $a is less than $b.
1802 le : whether $a is less than $b or equal to $b.
5398038e
TS
1803 gt : whether $a is greater than $b.
1804 ge : whether $a is greater than $b or equal to $b.
1805
1806=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1807
3756e7ca 1808-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1809
1810Returns a sort key.
1811
1812You compare the sort keys using a binary comparison
1813and get the result of the comparison of the strings using UCA.
1814
5398038e 1815 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1816
1817 is equivalent to
1818
5398038e 1819 $Collator->cmp($a, $b)
45394607 1820
a7fbee98
JH
1821=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1822
3756e7ca
RGS
1823Converts a sorting key into its representation form.
1824If C<UCA_Version> is 8, the output is slightly different.
1825
a7fbee98
JH
1826 use Unicode::Collate;
1827 my $c = Unicode::Collate->new();
1828 print $c->viewSortKey("Perl"),"\n";
1829
0116f5dc
JH
1830 # output:
1831 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1832 # Level 1 Level 2 Level 3 Level 4
1833
4d36a948
TS
1834=back
1835
1836=head2 Methods for Searching
d16e9e3d 1837
4d36a948
TS
1838The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1839like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1840but they are not aware of any pattern, but only a literal substring.
1841
19265284
CBW
1842B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1843for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1844C<subst>, C<gsubst>) is croaked, as the position and the length might
1845differ from those on the specified string.
1846
1847C<rearrange> and C<hangul_terminator> parameters are neglected.
1848C<katakana_before_hiragana> and C<upper_before_lower> don't affect
ed2081ad 1849matching and searching, as it doesn't matter whether greater or less.
19265284 1850
4d36a948
TS
1851=over 4
1852
1853=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1854
1855=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1856
1857If C<$substring> matches a part of C<$string>, returns
1858the position of the first occurrence of the matching part in scalar context;
1859in list context, returns a two-element list of
1860the position and the length of the matching part.
1861
d16e9e3d
JH
1862If C<$substring> does not match any part of C<$string>,
1863returns C<-1> in scalar context and
1864an empty list in list context.
1865
8c1c815e
CBW
1866e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1867you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
d16e9e3d 1868
5398038e 1869 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948 1870 # (normalization => undef) is REQUIRED.
d16e9e3d 1871 my $match;
a7fbee98 1872 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1873 $match = substr($str, $pos, $len);
d16e9e3d
JH
1874 }
1875
8c1c815e
CBW
1876and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1877is primary equal to C<"M>E<uuml>C<SS">.
4d36a948
TS
1878
1879=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1880
1881=item C<($match) = $Collator-E<gt>match($string, $substring)>
1882
1883If C<$substring> matches a part of C<$string>, in scalar context, returns
1884B<a reference to> the first occurrence of the matching part
1885(C<$match_ref> is always true if matches,
1886since every reference is B<true>);
1887in list context, returns the first occurrence of the matching part.
1888
1889If C<$substring> does not match any part of C<$string>,
1890returns C<undef> in scalar context and
1891an empty list in list context.
1892
1893e.g.
1894
1895 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1896 print "matches [$$match_ref].\n";
1897 } else {
1898 print "doesn't match.\n";
1899 }
1900
3756e7ca 1901 or
4d36a948
TS
1902
1903 if (($match) = $Collator->match($str, $sub)) { # list context
1904 print "matches [$match].\n";
1905 } else {
1906 print "doesn't match.\n";
1907 }
1908
1909=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1910
1911If C<$substring> matches a part of C<$string>, returns
1912all the matching parts (or matching count in scalar context).
1913
1914If C<$substring> does not match any part of C<$string>,
1915returns an empty list.
1916
1917=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1918
1919If C<$substring> matches a part of C<$string>,
1920the first occurrence of the matching part is replaced by C<$replacement>
19265284 1921(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
4d36a948
TS
1922
1923C<$replacement> can be a C<CODEREF>,
1924taking the matching part as an argument,
1925and returning a string to replace the matching part
1926(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1927
1928=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1929
1930If C<$substring> matches a part of C<$string>,
19265284
CBW
1931all the occurrences of the matching part are replaced by C<$replacement>
1932(C<$string> is modified) and C<$count> is returned.
4d36a948
TS
1933
1934C<$replacement> can be a C<CODEREF>,
1935taking the matching part as an argument,
1936and returning a string to replace the matching part
1937(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1938
1939e.g.
1940
1941 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1942 # (normalization => undef) is REQUIRED.
19265284 1943 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
4d36a948
TS
1944 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1945
19265284 1946 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
4d36a948 1947 # i.e., all the camels are made bold-faced.
d16e9e3d 1948
19265284
CBW
1949 Examples: levels and ignore_level2 - what does camel match?
1950 ---------------------------------------------------------------------------
1951 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1952 -----------------------|---------------------------------------------------
1953 1 false | yes yes yes yes yes
1954 2 false | yes yes no yes yes
1955 3 false | yes no no yes yes
1956 4 false | yes no no no yes
1957 -----------------------|---------------------------------------------------
1958 1 true | yes yes yes yes yes
1959 2 true | yes yes yes yes yes
1960 3 true | yes no yes yes yes
1961 4 true | yes no yes no yes
1962 ---------------------------------------------------------------------------
1963 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1964 at any level.
1965
45394607
JH
1966=back
1967
3164dd77
TS
1968=head2 Other Methods
1969
1970=over 4
1971
0116f5dc
JH
1972=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1973
68adb2b0
CBW
1974=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1975
19265284 1976Changes the value of specified keys and returns the changed part.
0116f5dc
JH
1977
1978 $Collator = Unicode::Collate->new(level => 4);
1979
1980 $Collator->eq("perl", "PERL"); # false
1981
1982 %old = $Collator->change(level => 2); # returns (level => 4).
1983
1984 $Collator->eq("perl", "PERL"); # true
1985
1986 $Collator->change(%old); # returns (level => 2).
1987
1988 $Collator->eq("perl", "PERL"); # false
1989
1990Not all C<(key,value)>s are allowed to be changed.
1991See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1992
1993In the scalar context, returns the modified collator
1994(but it is B<not> a clone from the original).
1995
1996 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1997
1998 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1999
2000 $Collator->change(level => 4)->eq("perl", "PERL"); # false
2001
91ae00cb 2002=item C<$version = $Collator-E<gt>version()>
3164dd77 2003
91ae00cb
NC
2004Returns the version number (a string) of the Unicode Standard
2005which the C<table> file used by the collator object is based on.
2006If the table does not include a version line (starting with C<@version>),
2007returns C<"unknown">.
2008
2009=item C<UCA_Version()>
3164dd77 2010
7b98b857
CBW
2011Returns the revision number of UTS #10 this module consults,
2012that should correspond with the DUCET incorporated.
3164dd77 2013
91ae00cb
NC
2014=item C<Base_Unicode_Version()>
2015
7b98b857
CBW
2016Returns the version number of UTS #10 this module consults,
2017that should correspond with the DUCET incorporated.
3164dd77
TS
2018
2019=back
2020
3756e7ca
RGS
2021=head1 EXPORT
2022
2023No method will be exported.
45394607 2024
0d50d293
RGS
2025=head1 INSTALL
2026
2027Though this module can be used without any C<table> file,
2028to use this module easily, it is recommended to install a table file
2029in the UCA format, by copying it under the directory
2030<a place in @INC>/Unicode/Collate.
2031
6d24ed10
SP
2032The most preferable one is "The Default Unicode Collation Element Table"
2033(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
2034
2035 http://www.unicode.org/Public/UCA/
2036
2037 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
2038
2039If DUCET is not installed, it is recommended to copy the file
2040from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2041to <a place in @INC>/Unicode/Collate/allkeys.txt
2042manually.
2043
3756e7ca
RGS
2044=head1 CAVEATS
2045
2046=over 4
45394607 2047
3756e7ca 2048=item Normalization
45394607 2049
3756e7ca
RGS
2050Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2051module (see L<Unicode::Normalize>).
45394607 2052
5398038e 2053If you need not it (say, in the case when you need not
45394607 2054handle any combining characters),
60f577e0 2055assign C<(normalization =E<gt> undef)> explicitly.
45394607 2056
4d36a948 2057-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 2058
3756e7ca 2059=item Conformance Test
0116f5dc 2060
10d7ec48
NC
2061The Conformance Test for the UCA is available
2062under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
2063
2064For F<CollationTest_SHIFTED.txt>,
2065a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2066for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 2067C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 2068
750da838
CBW
2069If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2070C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2071C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2072C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2073
4d36a948 2074B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 2075
3756e7ca
RGS
2076=back
2077
0d50d293 2078=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 2079
0d50d293 2080The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
f8187d97 2081<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014,
0d50d293 2082SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 2083
0d50d293
RGS
2084This module is free software; you can redistribute it and/or
2085modify it under the same terms as Perl itself.
45394607 2086
00e00351 2087The file Unicode/Collate/allkeys.txt was copied verbatim
8c1c815e 2088from L<http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt>.
5c77070f 2089For this file, Copyright (c) 2001-2012 Unicode, Inc.
6d24ed10 2090Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
45394607
JH
2091
2092=head1 SEE ALSO
2093
2094=over 4
2095
91ae00cb
NC
2096=item Unicode Collation Algorithm - UTS #10
2097
2098L<http://www.unicode.org/reports/tr10/>
2099
2100=item The Default Unicode Collation Element Table (DUCET)
2101
10d7ec48 2102L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 2103
91ae00cb 2104=item The conformance test for the UCA
45394607 2105
10d7ec48 2106L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 2107
10d7ec48 2108L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 2109
91ae00cb 2110=item Hangul Syllable Type
0116f5dc 2111
10d7ec48 2112L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 2113
91ae00cb 2114=item Unicode Normalization Forms - UAX #15
a7fbee98 2115
91ae00cb 2116L<http://www.unicode.org/reports/tr15/>
a7fbee98 2117
aa7758f7
CBW
2118=item Unicode Locale Data Markup Language (LDML) - UTS #35
2119
2120L<http://www.unicode.org/reports/tr35/>
2121
45394607
JH
2122=back
2123
2124=cut