This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Updated Unicode-Collate to CPAN version 0.76
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c
JH
6 }
7}
8
45394607
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48
NC
15no warnings 'utf8';
16
19265284 17our $VERSION = '0.76';
45394607
JH
18our $PACKAGE = __PACKAGE__;
19
19265284 20### begin XS only ###
f58b9ef1
CBW
21require DynaLoader;
22our @ISA = qw(DynaLoader);
23bootstrap Unicode::Collate $VERSION;
19265284 24### end XS only ###
f58b9ef1 25
e7f779c8
RGS
26my @Path = qw(Unicode Collate);
27my $KeyFile = "allkeys.txt";
45394607 28
4d36a948
ST
29# Perl's boolean
30use constant TRUE => 1;
31use constant FALSE => "";
32use constant NOMATCHPOS => -1;
33
34# A coderef to get combining class imported from Unicode::Normalize
35# (i.e. \&Unicode::Normalize::getCombinClass).
36# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 37my $CVgetCombinClass;
4d36a948 38
9f1f04a1
RGS
39# Supported Levels
40use constant MinLevel => 1;
41use constant MaxLevel => 4;
42
4d36a948 43# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
44use constant Min2Wt => 0x20;
45use constant Min3Wt => 0x02;
4d36a948
ST
46
47# Shifted weight at 4th level
9f1f04a1 48use constant Shift4Wt => 0xFFFF;
4d36a948 49
4d36a948
ST
50# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
51# PROBLEM: The Default Unicode Collation Element Table
52# has weights over 0xFFFF at the 4th level.
53# The tie-breaking in the variable weights
54# other than "shift" (as well as "shift-trimmed") is unreliable.
55use constant VCE_TEMPLATE => 'Cn4';
56
4d36a948
ST
57# A sort key: 16-bit weights
58# See also the PROBLEM on VCE_TEMPLATE above.
59use constant KEY_TEMPLATE => 'n*';
60
61# Level separator in a sort key:
62# i.e. pack(KEY_TEMPLATE, 0)
63use constant LEVEL_SEP => "\0\0";
64
65# As Unicode code point separator for hash keys.
66# A joined code point string (denoted by JCPS below)
67# like "65;768" is used for internal processing
68# instead of Perl's Unicode string like "\x41\x{300}",
69# as the native code point is different from the Unicode code point
70# on EBCDIC platform.
71# This character must not be included in any stringified
72# representation of an integer.
73use constant CODE_SEP => ';';
19265284 74 # NOTE: in regex /;/ is used for $jcps!
4d36a948
ST
75
76# boolean values of variable weights
0116f5dc
JH
77use constant NON_VAR => 0; # Non-Variable character
78use constant VAR => 1; # Variable character
3164dd77 79
91ae00cb 80# specific code points
00e00351
CBW
81use constant Hangul_SIni => 0xAC00;
82use constant Hangul_SFin => 0xD7A3;
584e761d 83
4d36a948 84# Logical_Order_Exception in PropList.txt
e7f779c8 85my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 86
7b98b857 87sub UCA_Version { "22" }
a7fbee98 88
7b98b857 89sub Base_Unicode_Version { "6.0.0" }
a7fbee98 90
9f1f04a1
RGS
91######
92
9f1f04a1 93sub pack_U {
ae6aa562 94 return pack('U*', @_);
9f1f04a1
RGS
95}
96
9f1f04a1
RGS
97######
98
91ae00cb
NC
99my (%VariableOK);
100@VariableOK{ qw/
0116f5dc 101 blanked non-ignorable shifted shift-trimmed
91ae00cb 102 / } = (); # keys lowercased
0116f5dc
JH
103
104our @ChangeOK = qw/
105 alternate backwards level normalization rearrange
19265284 106 katakana_before_hiragana upper_before_lower ignore_level2
0116f5dc 107 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 108 hangul_terminator variable
0116f5dc
JH
109 /;
110
111our @ChangeNG = qw/
b5d9a953 112 entry mapping table maxlength contraction
19265284
CBW
113 ignoreChar ignoreName undefChar undefName rewrite
114 versionTable alternateTable backwardsTable forwardsTable
115 rearrangeTable variableTable
00e00351 116 derivCode normCode rearrangeHash backwardsFlag
aa7758f7 117 suppress suppressHash
19265284 118 __useXS /; ### XS only
9f1f04a1
RGS
119# The hash key 'ignored' is deleted at v 0.21.
120# The hash key 'isShift' is deleted at v 0.23.
121# The hash key 'combining' is deleted at v 0.24.
91ae00cb 122# The hash key 'entries' is deleted at v 0.30.
e7f779c8 123# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb
NC
124
125sub version {
126 my $self = shift;
127 return $self->{versionTable} || 'unknown';
128}
0116f5dc
JH
129
130my (%ChangeOK, %ChangeNG);
131@ChangeOK{ @ChangeOK } = ();
132@ChangeNG{ @ChangeNG } = ();
133
134sub change {
135 my $self = shift;
136 my %hash = @_;
137 my %old;
91ae00cb
NC
138 if (exists $hash{variable} && exists $hash{alternate}) {
139 delete $hash{alternate};
140 }
141 elsif (!exists $hash{variable} && exists $hash{alternate}) {
142 $hash{variable} = $hash{alternate};
143 }
0116f5dc
JH
144 foreach my $k (keys %hash) {
145 if (exists $ChangeOK{$k}) {
146 $old{$k} = $self->{$k};
147 $self->{$k} = $hash{$k};
148 }
149 elsif (exists $ChangeNG{$k}) {
150 croak "change of $k via change() is not allowed!";
151 }
152 # else => ignored
153 }
3756e7ca 154 $self->checkCollator();
0116f5dc
JH
155 return wantarray ? %old : $self;
156}
a7fbee98 157
9f1f04a1
RGS
158sub _checkLevel {
159 my $level = shift;
abd1ec54
NC
160 my $key = shift; # 'level' or 'backwards'
161 MinLevel <= $level or croak sprintf
162 "Illegal level %d (in value for key '%s') lower than %d.",
163 $level, $key, MinLevel;
164 $level <= MaxLevel or croak sprintf
165 "Unsupported level %d (in value for key '%s') higher than %d.",
166 $level, $key, MaxLevel;
9f1f04a1
RGS
167}
168
91ae00cb
NC
169my %DerivCode = (
170 8 => \&_derivCE_8,
171 9 => \&_derivCE_9,
172 11 => \&_derivCE_9, # 11 == 9
3756e7ca 173 14 => \&_derivCE_14,
74b94a79
CBW
174 16 => \&_derivCE_14, # 16 == 14
175 18 => \&_derivCE_18,
00e00351 176 20 => \&_derivCE_20,
b5d9a953 177 22 => \&_derivCE_22,
91ae00cb
NC
178);
179
0116f5dc
JH
180sub checkCollator {
181 my $self = shift;
9f1f04a1 182 _checkLevel($self->{level}, "level");
a7fbee98 183
91ae00cb
NC
184 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
185 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 186
91ae00cb 187 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 188 $self->{alternateTable} || 'shifted';
91ae00cb
NC
189 $self->{variable} = $self->{alternate} = lc($self->{variable});
190 exists $VariableOK{ $self->{variable} }
3756e7ca 191 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 192
9f1f04a1
RGS
193 if (! defined $self->{backwards}) {
194 $self->{backwardsFlag} = 0;
195 }
196 elsif (! ref $self->{backwards}) {
197 _checkLevel($self->{backwards}, "backwards");
198 $self->{backwardsFlag} = 1 << $self->{backwards};
199 }
200 else {
201 my %level;
202 $self->{backwardsFlag} = 0;
203 for my $b (@{ $self->{backwards} }) {
204 _checkLevel($b, "backwards");
205 $level{$b} = 1;
206 }
207 for my $v (sort keys %level) {
208 $self->{backwardsFlag} += 1 << $v;
209 }
210 }
0116f5dc 211
91ae00cb
NC
212 defined $self->{rearrange} or $self->{rearrange} = [];
213 ref $self->{rearrange}
214 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
215
216 # keys of $self->{rearrangeHash} are $self->{rearrange}.
217 $self->{rearrangeHash} = undef;
218
219 if (@{ $self->{rearrange} }) {
220 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
221 }
222
223 $self->{normCode} = undef;
a7fbee98
JH
224
225 if (defined $self->{normalization}) {
226 eval { require Unicode::Normalize };
91ae00cb 227 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 228
91ae00cb 229 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 230
91ae00cb
NC
231 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
232 $self->{normCode} = \&Unicode::Normalize::NFD;
233 }
234 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
235 my $norm = $self->{normalization};
236 $self->{normCode} = sub {
1d2654e1
JH
237 Unicode::Normalize::normalize($norm, shift);
238 };
06c8fc8f
RGS
239 eval { $self->{normCode}->("") }; # try
240 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 241 }
a7fbee98 242 }
0116f5dc
JH
243 return;
244}
245
246sub new
247{
248 my $class = shift;
249 my $self = bless { @_ }, $class;
45394607 250
19265284
CBW
251### begin XS only ###
252 if (! exists $self->{table} && !defined $self->{rewrite} &&
f58b9ef1
CBW
253 !defined $self->{undefName} && !defined $self->{ignoreName} &&
254 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
255 $self->{__useXS} = \&_fetch_simple;
19265284
CBW
256 } else {
257 $self->{__useXS} = undef;
258 }
259### end XS only ###
f58b9ef1 260
aa7758f7
CBW
261 # keys of $self->{suppressHash} are $self->{suppress}.
262 if ($self->{suppress} && @{ $self->{suppress} }) {
263 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
264 } # before read_table()
265
a7fbee98 266 # If undef is passed explicitly, no file is read.
0116f5dc 267 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 268 $self->read_table() if defined $self->{table};
905aa9f0 269
a7fbee98 270 if ($self->{entry}) {
e7f779c8
RGS
271 while ($self->{entry} =~ /([^\n]+)/g) {
272 $self->parseEntry($1);
273 }
a7fbee98 274 }
905aa9f0 275
9f1f04a1 276 $self->{level} ||= MaxLevel;
0116f5dc 277 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 278
abd1ec54 279 $self->{overrideHangul} = FALSE
0116f5dc 280 if ! exists $self->{overrideHangul};
abd1ec54 281 $self->{overrideCJK} = FALSE
0116f5dc 282 if ! exists $self->{overrideCJK};
06c8fc8f 283 $self->{normalization} = 'NFD'
0116f5dc 284 if ! exists $self->{normalization};
3756e7ca
RGS
285 $self->{rearrange} = $self->{rearrangeTable} ||
286 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 287 if ! exists $self->{rearrange};
0116f5dc
JH
288 $self->{backwards} = $self->{backwardsTable}
289 if ! exists $self->{backwards};
a7fbee98 290
3756e7ca 291 $self->checkCollator();
a7fbee98
JH
292
293 return $self;
294}
905aa9f0 295
00e00351
CBW
296sub parseAtmark {
297 my $self = shift;
298 my $line = shift; # after s/^\s*\@//
299
300 if ($line =~ /^version\s*(\S*)/) {
301 $self->{versionTable} ||= $1;
302 }
303 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
304 $self->{variableTable} ||= $1;
305 }
306 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
307 $self->{alternateTable} ||= $1;
308 }
309 elsif ($line =~ /^backwards\s+(\S*)/) {
310 push @{ $self->{backwardsTable} }, $1;
311 }
312 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
313 push @{ $self->{forwardsTable} }, $1;
314 }
315 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
316 push @{ $self->{rearrangeTable} }, _getHexArray($1);
317 }
318}
319
905aa9f0 320sub read_table {
a7fbee98 321 my $self = shift;
a7fbee98 322
19265284 323### begin XS only ###
f58b9ef1
CBW
324 if ($self->{__useXS}) {
325 my @rest = _fetch_rest(); # complex matter need to parse
326 for my $line (@rest) {
327 next if $line =~ /^\s*#/;
328
329 if ($line =~ s/^\s*\@//) {
330 $self->parseAtmark($line);
331 } else {
332 $self->parseEntry($line);
333 }
334 }
335 return;
336 }
19265284 337### end XS only ###
f58b9ef1 338
e7f779c8
RGS
339 my($f, $fh);
340 foreach my $d (@INC) {
341 $f = File::Spec->catfile($d, @Path, $self->{table});
342 last if open($fh, $f);
343 $f = undef;
344 }
0d50d293
RGS
345 if (!defined $f) {
346 $f = File::Spec->catfile(@Path, $self->{table});
347 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
348 }
a7fbee98 349
6d24ed10
SP
350 while (my $line = <$fh>) {
351 next if $line =~ /^\s*#/;
abd1ec54 352
00e00351
CBW
353 if ($line =~ s/^\s*\@//) {
354 $self->parseAtmark($line);
355 } else {
356 $self->parseEntry($line);
abd1ec54 357 }
45394607 358 }
e7f779c8 359 close $fh;
45394607
JH
360}
361
905aa9f0 362
45394607
JH
363##
364## get $line, parse it, and write an entry in $self
365##
366sub parseEntry
367{
a7fbee98
JH
368 my $self = shift;
369 my $line = shift;
4d36a948 370 my($name, $entry, @uv, @key);
a7fbee98 371
19265284
CBW
372 if (defined $self->{rewrite}) {
373 $line = $self->{rewrite}->($line);
374 }
375
a7fbee98
JH
376 return if $line !~ /^\s*[0-9A-Fa-f]/;
377
378 # removes comment and gets name
379 $name = $1
380 if $line =~ s/[#%]\s*(.*)//;
381 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
382
383 # gets element
384 my($e, $k) = split /;/, $line;
385 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
386 if ! $k;
387
4d36a948
ST
388 @uv = _getHexArray($e);
389 return if !@uv;
aa7758f7
CBW
390 return if @uv > 1 && $self->{suppressHash} &&
391 exists $self->{suppressHash}{$uv[0]};
4d36a948 392 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 393
4d36a948 394 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 395 my $ele = pack_U(@uv);
a7fbee98 396
4d36a948
ST
397 # regarded as if it were not entried in the table
398 return
399 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 400
4d36a948
ST
401 # replaced as completely ignorable
402 $k = '[.0000.0000.0000.0000]'
403 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 404 }
0116f5dc 405
4d36a948
ST
406 # replaced as completely ignorable
407 $k = '[.0000.0000.0000.0000]'
408 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
409
4c843366 410 my $is_L3_ignorable = TRUE;
4d36a948 411
caffd4cf
ST
412 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
413 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
ST
414 my @wt = _getHexArray($arr);
415 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 416 $is_L3_ignorable = FALSE
3756e7ca
RGS
417 if $wt[0] || $wt[1] || $wt[2];
418 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
419 # is completely ignorable.
4c843366
JH
420 # For expansion, an entry $is_L3_ignorable
421 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 422 }
caffd4cf 423
e7f779c8 424 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 425
91ae00cb 426 if (@uv > 1) {
b5d9a953
CBW
427 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
428 $self->{maxlength}{$uv[0]} = @uv;
429 }
430 }
431 if (@uv > 2) {
432 while (@uv) {
433 pop @uv;
434 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
435 $self->{contraction}{$fake_entry} = 1;
436 }
91ae00cb 437 }
45394607
JH
438}
439
9f1f04a1 440
45394607
JH
441sub viewSortKey
442{
a7fbee98 443 my $self = shift;
9f1f04a1
RGS
444 $self->visualizeSortKey($self->getSortKey(@_));
445}
0116f5dc 446
d16e9e3d 447
45394607 448##
91ae00cb
NC
449## arrayref of JCPS = splitEnt(string to be collated)
450## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 451##
91ae00cb 452sub splitEnt
45394607 453{
a7fbee98 454 my $self = shift;
4d36a948
ST
455 my $wLen = $_[1];
456
a7fbee98 457 my $code = $self->{preprocess};
0116f5dc 458 my $norm = $self->{normCode};
91ae00cb 459 my $map = $self->{mapping};
a7fbee98
JH
460 my $max = $self->{maxlength};
461 my $reH = $self->{rearrangeHash};
b5d9a953
CBW
462 my $vers = $self->{UCA_Version};
463 my $ver9 = $vers >= 9 && $vers <= 11;
19265284 464 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 465
4d36a948 466 my ($str, @buf);
a7fbee98 467
4d36a948
ST
468 if ($wLen) {
469 $code and croak "Preprocess breaks character positions. "
470 . "Don't use with index(), match(), etc.";
471 $norm and croak "Normalization breaks character positions. "
472 . "Don't use with index(), match(), etc.";
473 $str = $_[0];
474 }
475 else {
476 $str = $_[0];
477 $str = &$code($str) if ref $code;
478 $str = &$norm($str) if ref $norm;
479 }
a7fbee98 480
4d36a948 481 # get array of Unicode code point of string.
9f1f04a1 482 my @src = unpack_U($str);
4d36a948
ST
483
484 # rearrangement:
485 # Character positions are not kept if rearranged,
486 # then neglected if $wLen is true.
487 if ($reH && ! $wLen) {
a7fbee98
JH
488 for (my $i = 0; $i < @src; $i++) {
489 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
490 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
491 $i++;
492 }
493 }
45394607 494 }
45394607 495
3756e7ca 496 # remove a code point marked as a completely ignorable.
abd1ec54 497 for (my $i = 0; $i < @src; $i++) {
b5d9a953
CBW
498 if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
499 $src[$i] = undef;
500 } elsif ($ver9) {
501 $src[$i] = undef if $map->{ $src[$i] } &&
502 @{ $map->{ $src[$i] } } == 0;
19265284 503### begin XS only ###
f58b9ef1
CBW
504 if ($uXS) {
505 $src[$i] = undef if _ignorable_simple($src[$i]);
506 }
19265284 507### end XS only ###
b5d9a953 508 }
0116f5dc
JH
509 }
510
a7fbee98 511 for (my $i = 0; $i < @src; $i++) {
91ae00cb 512 my $jcps = $src[$i];
3756e7ca
RGS
513
514 # skip removed code point
515 if (! defined $jcps) {
516 if ($wLen && @buf) {
517 $buf[-1][2] = $i + 1;
518 }
519 next;
520 }
521
abd1ec54 522 my $i_orig = $i;
4d36a948 523
3756e7ca
RGS
524 # find contraction
525 if ($max->{$jcps}) {
91ae00cb
NC
526 my $temp_jcps = $jcps;
527 my $jcpsLen = 1;
528 my $maxLen = $max->{$jcps};
4d36a948 529
91ae00cb 530 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 531 next if ! defined $src[$p];
91ae00cb
NC
532 $temp_jcps .= CODE_SEP . $src[$p];
533 $jcpsLen++;
534 if ($map->{$temp_jcps}) {
535 $jcps = $temp_jcps;
4d36a948
ST
536 $i = $p;
537 }
538 }
4d36a948 539
b5d9a953 540 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
06c8fc8f 541 # This process requires Unicode::Normalize.
91ae00cb 542 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
543 # (in spite of bool value of $CVgetCombinClass),
544 # since canonical ordering cannot be expected.
545 # Blocked combining character should not be contracted.
546
06c8fc8f 547 # $self->{normCode} is false in the case of "prenormalized".
b5d9a953
CBW
548 if ($self->{normalization}) {
549 my $cont = $self->{contraction};
06c8fc8f 550 my $preCC = 0;
b5d9a953
CBW
551 my $preCC_uc = 0;
552 my $jcps_uc = $jcps;
553 my(@out, @out_uc);
06c8fc8f
RGS
554
555 for (my $p = $i + 1; $p < @src; $p++) {
556 next if ! defined $src[$p];
b5d9a953 557 my $curCC = $CVgetCombinClass->($src[$p]);
06c8fc8f
RGS
558 last unless $curCC;
559 my $tail = CODE_SEP . $src[$p];
b5d9a953
CBW
560
561 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
562 $cont->{$jcps_uc.$tail})) {
563 $jcps_uc .= $tail;
564 push @out_uc, $p;
565 } else {
566 $preCC_uc = $curCC;
567 }
568
91ae00cb
NC
569 if ($preCC != $curCC && $map->{$jcps.$tail}) {
570 $jcps .= $tail;
b5d9a953 571 push @out, $p;
06c8fc8f
RGS
572 } else {
573 $preCC = $curCC;
574 }
4d36a948 575 }
b5d9a953
CBW
576
577 if ($map->{$jcps_uc}) {
578 $jcps = $jcps_uc;
579 $src[$_] = undef for @out_uc;
580 } else {
581 $src[$_] = undef for @out;
582 }
a7fbee98 583 }
a7fbee98
JH
584 }
585
3756e7ca 586 # skip completely ignorable
19265284
CBW
587
588 if ($uXS && $jcps !~ /;/ && _ignorable_simple($jcps) || ### XS only
f58b9ef1 589 $map->{$jcps} && @{ $map->{$jcps} } == 0) {
3756e7ca
RGS
590 if ($wLen && @buf) {
591 $buf[-1][2] = $i + 1;
a7fbee98 592 }
3756e7ca 593 next;
a7fbee98 594 }
4d36a948 595
91ae00cb 596 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 597 }
4d36a948 598 return \@buf;
d16e9e3d 599}
45394607 600
539ce3d8
CBW
601##
602## VCE = _pack_override(input, codepoint, derivCode)
603##
604sub _pack_override ($$$) {
605 my $r = shift;
606 my $u = shift;
607 my $der = shift;
608
609 if (ref $r) {
610 return pack(VCE_TEMPLATE, NON_VAR, @$r);
611 } elsif (defined $r) {
612 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
613 } else {
614 return $der->($u);
615 }
616}
d16e9e3d
JH
617
618##
abd1ec54 619## list of VCE = getWt(JCPS)
d16e9e3d
JH
620##
621sub getWt
622{
a7fbee98 623 my $self = shift;
91ae00cb
NC
624 my $u = shift;
625 my $map = $self->{mapping};
0116f5dc 626 my $der = $self->{derivCode};
19265284 627 my $uXS = $self->{__useXS}; ### XS only
a7fbee98 628
91ae00cb 629 return if !defined $u;
19265284 630 return map($self->varCE($_), @{ $map->{$u} })
91ae00cb 631 if $map->{$u};
19265284
CBW
632### begin XS only ###
633 return map($self->varCE($_), _fetch_simple($u))
f58b9ef1 634 if $uXS && _exists_simple($u);
19265284 635### end XS only ###
a7fbee98 636
91ae00cb
NC
637 # JCPS must not be a contraction, then it's a code point.
638 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1
JH
639 my $hang = $self->{overrideHangul};
640 my @hangulCE;
641 if ($hang) {
539ce3d8
CBW
642 @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
643 } elsif (!defined $hang) {
1d2654e1 644 @hangulCE = $der->($u);
539ce3d8 645 } else {
1d2654e1
JH
646 my $max = $self->{maxlength};
647 my @decH = _decompHangul($u);
648
649 if (@decH == 2) {
650 my $contract = join(CODE_SEP, @decH);
91ae00cb 651 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
652 } else { # must be <@decH == 3>
653 if ($max->{$decH[0]}) {
654 my $contract = join(CODE_SEP, @decH);
91ae00cb 655 if ($map->{$contract}) {
1d2654e1
JH
656 @decH = ($contract);
657 } else {
658 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 659 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
660 }
661 # even if V's ignorable, LT contraction is not supported.
f58b9ef1 662 # If such a situation were required, NFD should be used.
1d2654e1
JH
663 }
664 if (@decH == 3 && $max->{$decH[1]}) {
665 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 666 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
667 }
668 }
669
670 @hangulCE = map({
f58b9ef1 671 $map->{$_} ? @{ $map->{$_} } :
19265284 672 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
f58b9ef1 673 $der->($_);
1d2654e1
JH
674 } @decH);
675 }
19265284 676 return map $self->varCE($_), @hangulCE;
539ce3d8 677 } else {
584e761d
CBW
678 my $cjk = $self->{overrideCJK};
679 my $vers = $self->{UCA_Version};
680 if ($cjk && _isUIdeo($u, $vers)) {
681 my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
19265284 682 return map $self->varCE($_), @cjkCE;
584e761d
CBW
683 }
684 if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
19265284 685 return map $self->varCE($_), _uideoCE_8($u);
584e761d 686 }
19265284 687 return map $self->varCE($_), $der->($u);
a7fbee98 688 }
d16e9e3d
JH
689}
690
d16e9e3d
JH
691
692##
693## string sortkey = getSortKey(string arg)
694##
695sub getSortKey
696{
a7fbee98 697 my $self = shift;
91ae00cb 698 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
00e00351 699 my $vers = $self->{UCA_Version};
aa7758f7 700 my $term = $self->{hangul_terminator};
91ae00cb 701
abd1ec54 702 my @buf; # weight arrays
aa7758f7 703 if ($term) {
91ae00cb 704 my $preHST = '';
19265284 705 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
91ae00cb
NC
706 foreach my $jcps (@$rEnt) {
707 # weird things like VL, TL-contraction are not considered!
aa7758f7 708 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
91ae00cb
NC
709 if ($preHST && !$curHST || # hangul before non-hangul
710 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
711 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
712 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
aa7758f7 713 push @buf, $termCE;
91ae00cb
NC
714 }
715 $preHST = $curHST;
abd1ec54 716 push @buf, $self->getWt($jcps);
91ae00cb 717 }
aa7758f7
CBW
718 push @buf, $termCE if $preHST; # end at hangul
719 } else {
91ae00cb 720 foreach my $jcps (@$rEnt) {
abd1ec54 721 push @buf, $self->getWt($jcps);
91ae00cb
NC
722 }
723 }
724
19265284 725 return $self->mk_SortKey(\@buf); ### XS only
45394607
JH
726}
727
728
729##
d16e9e3d 730## int compare = cmp(string a, string b)
45394607 731##
5398038e 732sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
733sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
734sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
735sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
736sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
737sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
738sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
739
740##
d16e9e3d 741## list[strings] sorted = sort(list[strings] arg)
45394607 742##
a7fbee98
JH
743sub sort {
744 my $obj = shift;
745 return
746 map { $_->[1] }
747 sort{ $a->[0] cmp $b->[0] }
748 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
749}
750
0116f5dc 751
4d36a948
ST
752##
753## bool _nonIgnorAtLevel(arrayref weights, int level)
754##
755sub _nonIgnorAtLevel($$)
756{
757 my $wt = shift;
758 return if ! defined $wt;
759 my $lv = shift;
9f1f04a1 760 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
ST
761}
762
763##
764## bool _eqArray(
765## arrayref of arrayref[weights] source,
766## arrayref of arrayref[weights] substr,
767## int level)
768## * comparison of graphemes vs graphemes.
769## @$source >= @$substr must be true (check it before call this);
770##
771sub _eqArray($$$)
772{
773 my $source = shift;
774 my $substr = shift;
775 my $lev = shift;
776
777 for my $g (0..@$substr-1){
f58b9ef1 778 # Do the $g'th graphemes have the same number of AV weights?
4d36a948
ST
779 return if @{ $source->[$g] } != @{ $substr->[$g] };
780
781 for my $w (0..@{ $substr->[$g] }-1) {
782 for my $v (0..$lev-1) {
783 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
784 }
785 }
786 }
787 return 1;
788}
789
790##
791## (int position, int length)
792## int position = index(string, substring, position, [undoc'ed grobal])
793##
794## With "grobal" (only for the list context),
795## returns list of arrayref[position, length].
796##
797sub index
798{
91ae00cb
NC
799 my $self = shift;
800 my $str = shift;
801 my $len = length($str);
802 my $subE = $self->splitEnt(shift);
803 my $pos = @_ ? shift : 0;
804 $pos = 0 if $pos < 0;
805 my $grob = shift;
806
807 my $lev = $self->{level};
3756e7ca
RGS
808 my $v2i = $self->{UCA_Version} >= 9 &&
809 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
810
811 if (! @$subE) {
4d36a948
ST
812 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
813 return $grob
814 ? map([$_, 0], $temp..$len)
815 : wantarray ? ($temp,0) : $temp;
816 }
abd1ec54
NC
817 $len < $pos
818 and return wantarray ? () : NOMATCHPOS;
91ae00cb 819 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
820 @$strE
821 or return wantarray ? () : NOMATCHPOS;
822
4d36a948
ST
823 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
824
abd1ec54
NC
825 my $last_is_variable;
826 for my $vwt (map $self->getWt($_), @$subE) {
827 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
828 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 829
3756e7ca
RGS
830 # "Ignorable (L1, L2) after Variable" since track. v. 9
831 if ($v2i) {
abd1ec54
NC
832 if ($var) {
833 $last_is_variable = TRUE;
834 }
835 elsif (!$wt[0]) { # ignorable
4d36a948 836 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
837 }
838 else {
839 $last_is_variable = FALSE;
4d36a948
ST
840 }
841 }
842
abd1ec54
NC
843 if (@subWt && !$var && !$wt[0]) {
844 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1393fe00 845 } elsif ($to_be_pushed) {
abd1ec54 846 push @subWt, [ \@wt ];
4d36a948 847 }
1393fe00 848 # else ===> skipped
4d36a948
ST
849 }
850
851 my $count = 0;
91ae00cb 852 my $end = @$strE - 1;
4d36a948 853
abd1ec54 854 $last_is_variable = FALSE; # reuse
4d36a948
ST
855 for (my $i = 0; $i <= $end; ) { # no $i++
856 my $found_base = 0;
857
858 # fetch a grapheme
859 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
860 for my $vwt ($self->getWt($strE->[$i][0])) {
861 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
862 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 863
3756e7ca
RGS
864 # "Ignorable (L1, L2) after Variable" since track. v. 9
865 if ($v2i) {
abd1ec54
NC
866 if ($var) {
867 $last_is_variable = TRUE;
868 }
869 elsif (!$wt[0]) { # ignorable
4d36a948 870 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
871 }
872 else {
873 $last_is_variable = FALSE;
4d36a948
ST
874 }
875 }
876
abd1ec54
NC
877 if (@strWt && !$var && !$wt[0]) {
878 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 879 $finPos[-1] = $strE->[$i][2];
4d36a948 880 } elsif ($to_be_pushed) {
abd1ec54 881 push @strWt, [ \@wt ];
91ae00cb 882 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 883 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 884 push @finPos, $strE->[$i][2];
4d36a948
ST
885 $found_base++;
886 }
887 # else ===> no-op
888 }
889 $i++;
890 }
891
892 # try to match
893 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
894 if ($iniPos[0] != NOMATCHPOS &&
895 $finPos[$#subWt] != NOMATCHPOS &&
896 _eqArray(\@strWt, \@subWt, $lev)) {
897 my $temp = $iniPos[0] + $pos;
898
899 if ($grob) {
900 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
901 splice @strWt, 0, $#subWt;
902 splice @iniPos, 0, $#subWt;
903 splice @finPos, 0, $#subWt;
904 }
905 else {
906 return wantarray
907 ? ($temp, $finPos[$#subWt] - $iniPos[0])
908 : $temp;
909 }
910 }
911 shift @strWt;
912 shift @iniPos;
913 shift @finPos;
914 }
915 }
916
917 return $grob
918 ? @g_ret
919 : wantarray ? () : NOMATCHPOS;
920}
921
922##
923## scalarref to matching part = match(string, substring)
924##
925sub match
926{
927 my $self = shift;
928 if (my($pos,$len) = $self->index($_[0], $_[1])) {
929 my $temp = substr($_[0], $pos, $len);
930 return wantarray ? $temp : \$temp;
931 # An lvalue ref \substr should be avoided,
932 # since its value is affected by modification of its referent.
933 }
934 else {
935 return;
936 }
937}
938
939##
940## arrayref matching parts = gmatch(string, substring)
941##
942sub gmatch
943{
944 my $self = shift;
945 my $str = shift;
946 my $sub = shift;
947 return map substr($str, $_->[0], $_->[1]),
948 $self->index($str, $sub, 0, 'g');
949}
950
951##
952## bool subst'ed = subst(string, substring, replace)
953##
954sub subst
955{
956 my $self = shift;
957 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
958
959 if (my($pos,$len) = $self->index($_[0], $_[1])) {
960 if ($code) {
961 my $mat = substr($_[0], $pos, $len);
962 substr($_[0], $pos, $len, $code->($mat));
963 } else {
964 substr($_[0], $pos, $len, $_[2]);
965 }
966 return TRUE;
967 }
968 else {
969 return FALSE;
970 }
971}
972
973##
974## int count = gsubst(string, substring, replace)
975##
976sub gsubst
977{
978 my $self = shift;
979 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
980 my $cnt = 0;
981
982 # Replacement is carried out from the end, then use reverse.
983 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
984 if ($code) {
985 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
986 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
987 } else {
988 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
989 }
990 $cnt++;
991 }
992 return $cnt;
993}
994
45394607
JH
9951;
996__END__
997
998=head1 NAME
999
a7fbee98 1000Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1001
1002=head1 SYNOPSIS
1003
1004 use Unicode::Collate;
1005
1006 #construct
5398038e 1007 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1008
1009 #sort
5398038e 1010 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1011
1012 #compare
a7fbee98 1013 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1014
539ce3d8
CBW
1015B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1016according to Perl's Unicode support. See L<perlunicode>,
1017L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1018Otherwise you can use C<preprocess> or should decode them before.
91ae00cb 1019
45394607
JH
1020=head1 DESCRIPTION
1021
3756e7ca
RGS
1022This module is an implementation of Unicode Technical Standard #10
1023(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1024
45394607
JH
1025=head2 Constructor and Tailoring
1026
539ce3d8
CBW
1027The C<new> method returns a collator object. If new() is called
1028with no parameters, the collator should do the default collation.
d16e9e3d 1029
5398038e 1030 $Collator = Unicode::Collate->new(
0116f5dc 1031 UCA_Version => $UCA_Version,
aa7758f7 1032 alternate => $alternate, # alias for 'variable'
45394607
JH
1033 backwards => $levelNumber, # or \@levelNumbers
1034 entry => $element,
91ae00cb 1035 hangul_terminator => $term_primary_weight,
45394607
JH
1036 ignoreName => qr/$ignoreName/,
1037 ignoreChar => qr/$ignoreChar/,
19265284 1038 ignore_level2 => $bool,
45394607
JH
1039 katakana_before_hiragana => $bool,
1040 level => $collationLevel,
91ae00cb 1041 normalization => $normalization_form,
45394607
JH
1042 overrideCJK => \&overrideCJK,
1043 overrideHangul => \&overrideHangul,
1044 preprocess => \&preprocess,
1045 rearrange => \@charList,
19265284 1046 rewrite => \&rewrite,
aa7758f7 1047 suppress => \@charList,
45394607
JH
1048 table => $filename,
1049 undefName => qr/$undefName/,
1050 undefChar => qr/$undefChar/,
1051 upper_before_lower => $bool,
91ae00cb 1052 variable => $variable,
45394607 1053 );
45394607
JH
1054
1055=over 4
1056
0116f5dc
JH
1057=item UCA_Version
1058
7b98b857
CBW
1059If the revision (previously "tracking version") number of UCA is given,
1060behavior of that revision is emulated on collating.
0116f5dc 1061If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1062
7b98b857 1063The following revisions are supported. The default is 22.
3756e7ca 1064
0d50d293 1065 UCA Unicode Standard DUCET (@version)
b5d9a953 1066 -------------------------------------------------------
0d50d293
RGS
1067 8 3.1 3.0.1 (3.0.1d9)
1068 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1069 11 4.0 4.0.0 (4.0.0)
1070 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1071 16 5.0 5.0.0 (5.0.0)
1072 18 5.1.0 5.1.0 (5.1.0)
00e00351 1073 20 5.2.0 5.2.0 (5.2.0)
b5d9a953 1074 22 6.0.0 6.0.0 (6.0.0)
3756e7ca 1075
f58b9ef1 1076* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
b5d9a953
CBW
1077since C<UCA_Version> 22.
1078
1079* Fully ignorable characters were ignored, and would not interrupt
1080contractions with C<UCA_Version> 9 and 11.
1081
1082* Treatment of ignorables after variables and some behaviors
1083were changed at C<UCA_Version> 9.
1084
1085* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1086depend on C<UCA_Version>.
1087
1088* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1089C<hangul_terminator>.
1090
3756e7ca 1091=item alternate
0116f5dc 1092
3756e7ca 1093-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1094
3756e7ca
RGS
1095For backward compatibility, C<alternate> (old name) can be used
1096as an alias for C<variable>.
0116f5dc 1097
45394607
JH
1098=item backwards
1099
4d36a948 1100-- see 3.1.2 French Accents, UTS #10.
45394607
JH
1101
1102 backwards => $levelNumber or \@levelNumbers
1103
1104Weights in reverse order; ex. level 2 (diacritic ordering) in French.
68adb2b0
CBW
1105If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1106forwards at all the levels.
45394607
JH
1107
1108=item entry
1109
4d36a948 1110-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1111
91ae00cb
NC
1112If the same character (or a sequence of characters) exists
1113in the collation element table through C<table>,
f58b9ef1 1114mapping to collation elements is overridden.
91ae00cb 1115If it does not exist, the mapping is defined additionally.
45394607 1116
abd1ec54
NC
1117 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11180063 0068 ; [.0E6A.0020.0002.0063] # ch
11190043 0068 ; [.0E6A.0020.0007.0043] # Ch
11200043 0048 ; [.0E6A.0020.0008.0043] # CH
1121006C 006C ; [.0F4C.0020.0002.006C] # ll
1122004C 006C ; [.0F4C.0020.0007.004C] # Ll
1123004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
112400F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1125006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
112600D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1127004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1128ENTRY
1129
1130 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
113100E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
113200C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1133ENTRY
45394607 1134
4d36a948 1135B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1136B<must> be a Unicode code point (defined as hexadecimal),
1137but not a native code point.
4d36a948
ST
1138So C<0063> must always denote C<U+0063>,
1139but not a character of C<"\x63">.
1140
abd1ec54
NC
1141Weighting may vary depending on collation element table.
1142So ensure the weights defined in C<entry> will be consistent with
1143those in the collation element table loaded via C<table>.
1144
1145In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1146and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1147(as a value between C<0E60> and C<0E6D>)
1148makes ordering as C<C E<lt> CH E<lt> D>.
1149Exactly speaking DUCET already has some characters between C<C> and C<D>:
1150C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1151C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1152and C<c-curl> (C<U+0255>) with C<0E69>.
1153Then primary weight C<0E6A> for C<CH> makes C<CH>
1154ordered between C<c-curl> and C<D>.
1155
91ae00cb
NC
1156=item hangul_terminator
1157
3756e7ca 1158-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1159
1160If a true value is given (non-zero but should be positive),
1161it will be added as a terminator primary weight to the end of
1162every standard Hangul syllable. Secondary and any higher weights
1163for terminator are set to zero.
1164If the value is false or C<hangul_terminator> key does not exist,
1165insertion of terminator weights will not be performed.
1166
1167Boundaries of Hangul syllables are determined
1168according to conjoining Jamo behavior in F<the Unicode Standard>
1169and F<HangulSyllableType.txt>.
1170
1171B<Implementation Note:>
1172(1) For expansion mapping (Unicode character mapped
1173to a sequence of collation elements), a terminator will not be added
1174between collation elements, even if Hangul syllable boundary exists there.
1175Addition of terminator is restricted to the next position
1176to the last collation element.
1177
1178(2) Non-conjoining Hangul letters
1179(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1180automatically terminated with a terminator primary weight.
1181These characters may need terminator included in a collation element
1182table beforehand.
1183
45394607
JH
1184=item ignoreChar
1185
3756e7ca
RGS
1186=item ignoreName
1187
1188-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1189
caffd4cf
ST
1190Makes the entry in the table completely ignorable;
1191i.e. as if the weights were zero at all level.
45394607 1192
3756e7ca
RGS
1193Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1194will be ignored. Through C<ignoreName>, any character whose name
1195(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1196will be ignored.
1197
a7fbee98 1198E.g. when 'a' and 'e' are ignorable,
45394607
JH
1199'element' is equal to 'lament' (or 'lmnt').
1200
19265284
CBW
1201=item ignore_level2
1202
1203-- see 5.1 Parametric Tailoring, UTS #10.
1204
1205By default, case-sensitive comparison (that is level 3 difference)
1206won't ignore accents (that is level 2 difference).
1207
1208If the parameter is made true, accents (and other primary ignorable
1209characters) are ignored, even though cases are taken into account.
1210
1211B<NOTE>: C<level> should be 3 or greater.
1212
3756e7ca
RGS
1213=item katakana_before_hiragana
1214
1215-- see 7.3.1 Tertiary Weight Table, UTS #10.
1216
1217By default, hiragana is before katakana.
1218If the parameter is made true, this is reversed.
1219
1220B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1221distinctions must occur in level 3, and their weights at level 3 must be
1222same as those mentioned in 7.3.1, UTS #10.
1223If you define your collation elements which violate this requirement,
1224this parameter does not work validly.
1225
45394607
JH
1226=item level
1227
3756e7ca 1228-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1229
1230Set the maximum level.
1231Any higher levels than the specified one are ignored.
1232
1233 Level 1: alphabetic ordering
1234 Level 2: diacritic ordering
1235 Level 3: case ordering
91ae00cb 1236 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1237
1238 ex.level => 2,
1239
a7fbee98
JH
1240If omitted, the maximum is the 4th.
1241
45394607
JH
1242=item normalization
1243
3756e7ca 1244-- see 4.1 Normalize, UTS #10.
45394607 1245
905aa9f0 1246If specified, strings are normalized before preparation of sort keys
45394607
JH
1247(the normalization is executed after preprocess).
1248
1d2654e1
JH
1249A form name C<Unicode::Normalize::normalize()> accepts will be applied
1250as C<$normalization_form>.
06c8fc8f 1251Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1252See C<Unicode::Normalize::normalize()> for detail.
1253If omitted, C<'NFD'> is used.
45394607 1254
91ae00cb 1255C<normalization> is performed after C<preprocess> (if defined).
45394607 1256
06c8fc8f
RGS
1257Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1258though they are not concerned with C<Unicode::Normalize::normalize()>.
1259
1260If C<undef> (not a string C<"undef">) is passed explicitly
1261as the value for this key,
45394607 1262any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1263if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1264only contiguous contractions are resolved;
1265e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1266C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1267In this point,
1268C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1269B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1270
1271In the case of C<(normalization =E<gt> "prenormalized")>,
1272any normalization is not performed, but
b5d9a953 1273discontiguous contractions with combining characters are performed.
06c8fc8f
RGS
1274Therefore
1275C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1276B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1277If source strings are finely prenormalized,
1278C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1279
1280Except C<(normalization =E<gt> undef)>,
1281B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1282
1283=item overrideCJK
1284
4d36a948 1285-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1286
584e761d
CBW
1287By default, CJK unified ideographs are ordered in Unicode codepoint
1288order, but those in the CJK Unified Ideographs block are lesser than
1289those in the CJK Unified Ideographs Extension A etc.
00e00351 1290
b5d9a953
CBW
1291 In the CJK Unified Ideographs block:
1292 U+4E00..U+9FA5 if UCA_Version is 8 to 11.
1293 U+4E00..U+9FBB if UCA_Version is 14 to 16.
1294 U+4E00..U+9FC3 if UCA_Version is 18.
1295 U+4E00..U+9FCB if UCA_Version is 20 or greater.
00e00351 1296
b5d9a953
CBW
1297 In the CJK Unified Ideographs Extension blocks:
1298 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1299 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater.
1300 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
91ae00cb 1301
584e761d 1302Through C<overrideCJK>, ordering of CJK unified ideographs (including
f58b9ef1 1303extensions) can be overridden.
45394607 1304
584e761d 1305ex. CJK unified ideographs in the JIS code point order.
45394607
JH
1306
1307 overrideCJK => sub {
a7fbee98
JH
1308 my $u = shift; # get a Unicode codepoint
1309 my $b = pack('n', $u); # to UTF-16BE
1310 my $s = your_unicode_to_sjis_converter($b); # convert
1311 my $n = unpack('n', $s); # convert sjis to short
1312 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1313 },
1314
539ce3d8
CBW
1315The return value may be an arrayref of 1st to 4th weights as shown
1316above. The return value may be an integer as the primary weight
1317as shown below. If C<undef> is returned, the default derived
1318collation element will be used.
1319
1320 overrideCJK => sub {
1321 my $u = shift; # get a Unicode codepoint
1322 my $b = pack('n', $u); # to UTF-16BE
1323 my $s = your_unicode_to_sjis_converter($b); # convert
1324 my $n = unpack('n', $s); # convert sjis to short
1325 return $n; # return the primary weight
1326 },
1327
1328The return value may be a list containing zero or more of
1329an arrayref, an integer, or C<undef>.
1330
584e761d 1331ex. ignores all CJK unified ideographs.
a7fbee98
JH
1332
1333 overrideCJK => sub {()}, # CODEREF returning empty list
1334
1335 # where ->eq("Pe\x{4E00}rl", "Perl") is true
584e761d 1336 # as U+4E00 is a CJK unified ideograph and to be ignorable.
a7fbee98
JH
1337
1338If C<undef> is passed explicitly as the value for this key,
584e761d
CBW
1339weights for CJK unified ideographs are treated as undefined.
1340But assignment of weight for CJK unified ideographs
1341in C<table> or C<entry> is still valid.
1342
1343B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1344C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1345C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
f58b9ef1 1346ideographs. But they can't be overridden via C<overrideCJK> when you use
584e761d
CBW
1347DUCET, as the table includes weights for them. C<table> or C<entry> has
1348priority over C<overrideCJK>.
a7fbee98
JH
1349
1350=item overrideHangul
1351
4d36a948 1352-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1353
584e761d 1354By default, Hangul syllables are decomposed into Hangul Jamo,
abd1ec54 1355even if C<(normalization =E<gt> undef)>.
f58b9ef1 1356But the mapping of Hangul syllables may be overridden.
a7fbee98 1357
3756e7ca 1358This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1359
584e761d
CBW
1360If you want to override the mapping of Hangul syllables,
1361NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1362Hangul syllables before overriding. FCD may decompose Hangul syllables
1363as the case may be.
45394607 1364
a7fbee98 1365If C<undef> is passed explicitly as the value for this key,
584e761d 1366weight for Hangul syllables is treated as undefined
a7fbee98 1367without decomposition into Hangul Jamo.
584e761d
CBW
1368But definition of weight for Hangul syllables
1369in C<table> or C<entry> is still valid.
a7fbee98 1370
45394607
JH
1371=item preprocess
1372
4d36a948 1373-- see 5.1 Preprocessing, UTS #10.
45394607 1374
19265284 1375If specified, the coderef is used to preprocess each string
45394607
JH
1376before the formation of sort keys.
1377
a7fbee98 1378ex. dropping English articles, such as "a" or "the".
45394607
JH
1379Then, "the pen" is before "a pencil".
1380
1381 preprocess => sub {
1382 my $str = shift;
a7fbee98 1383 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1384 return $str;
45394607
JH
1385 },
1386
91ae00cb 1387C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1388
539ce3d8
CBW
1389ex. decoding strings in a legacy encoding such as shift-jis:
1390
1391 $sjis_collator = Unicode::Collate->new(
1392 preprocess => \&your_shiftjis_to_unicode_decoder,
1393 );
1394 @result = $sjis_collator->sort(@shiftjis_strings);
1395
1396B<Note:> Strings returned from the coderef will be interpreted
1397according to Perl's Unicode support. See L<perlunicode>,
1398L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1399
45394607
JH
1400=item rearrange
1401
4d36a948 1402-- see 3.1.3 Rearrangement, UTS #10.
45394607
JH
1403
1404Characters that are not coded in logical order and to be rearranged.
3756e7ca 1405If C<UCA_Version> is equal to or lesser than 11, default is:
45394607
JH
1406
1407 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1408
3756e7ca
RGS
1409If you want to disallow any rearrangement, pass C<undef> or C<[]>
1410(a reference to empty list) as the value for this key.
1411
74b94a79
CBW
1412If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1413(i.e. no rearrangement).
a7fbee98 1414
0116f5dc
JH
1415B<According to the version 9 of UCA, this parameter shall not be used;
1416but it is not warned at present.>
1417
19265284
CBW
1418=item rewrite
1419
1420If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1421The coderef will get each line, and then should return a rewritten line
1422according to the UCA file format.
1423If the coderef returns an empty line, the line will be skipped.
1424
1425e.g. any primary ignorable characters into tertiary ignorable:
1426
1427 rewrite => sub {
1428 my $line = shift;
1429 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1430 return $line;
1431 },
1432
1433This example shows rewriting weights. C<rewrite> is allowed to
1434affect code points, weights, and the name.
1435
1436B<NOTE>: C<table> is available to use another table file;
1437preparing a modified table once would be more efficient than
1438rewriting lines on reading an unmodified table every time.
1439
aa7758f7
CBW
1440=item suppress
1441
1442-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1443UTS #35 (LDML).
1444
539ce3d8 1445Contractions beginning with the specified characters are suppressed,
584e761d 1446even if those contractions are defined in C<table> or C<entry>.
aa7758f7
CBW
1447
1448An example for Russian and some languages using the Cyrillic script:
1449
1450 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1451
1452where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1453
45394607
JH
1454=item table
1455
4d36a948 1456-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1457
91ae00cb 1458You can use another collation element table if desired.
45394607 1459
e7f779c8 1460The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1461on C<@INC>. Say, if the filename is F<Foo.txt>,
1462the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1463
1464By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1465If you will prepare your own table file, any name other than F<allkeys.txt>
1466may be better to avoid namespace conflict.
45394607 1467
00e00351
CBW
1468B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1469module, and it may save time at the run time.
1470Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
19265284
CBW
1471or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1472C<rewrite> will prevent this module from using the compiled DUCET.
00e00351 1473
a7fbee98 1474If C<undef> is passed explicitly as the value for this key,
91ae00cb 1475no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1476
1477A typical way to define a collation element table
1478without any file of table:
1479
1480 $onlyABC = Unicode::Collate->new(
1481 table => undef,
1482 entry => << 'ENTRIES',
14830061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
14840041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
14850062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
14860042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
14870063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
14880043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1489ENTRIES
1490 );
905aa9f0 1491
3756e7ca
RGS
1492If C<ignoreName> or C<undefName> is used, character names should be
1493specified as a comment (following C<#>) on each line.
45394607
JH
1494
1495=item undefChar
1496
3756e7ca
RGS
1497=item undefName
1498
4d36a948 1499-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1500
584e761d 1501Undefines the collation element as if it were unassigned in the C<table>.
45394607
JH
1502This reduces the size of the table.
1503If an unassigned character appears in the string to be collated,
1504the sort key is made from its codepoint
1505as a single-character collation element,
1506as it is greater than any other assigned collation elements
1507(in the codepoint order among the unassigned characters).
1508But, it'd be better to ignore characters
1509unfamiliar to you and maybe never used.
1510
3756e7ca
RGS
1511Through C<undefChar>, any character matching C<qr/$undefChar/>
1512will be undefined. Through C<undefName>, any character whose name
1513(given in the C<table> file as a comment) matches C<qr/$undefName/>
1514will be undefined.
1515
e7f779c8
RGS
1516ex. Collation weights for beyond-BMP characters are not stored in object:
1517
1518 undefChar => qr/[^\0-\x{fffd}]/,
1519
45394607
JH
1520=item upper_before_lower
1521
3756e7ca 1522-- see 6.6 Case Comparisons, UTS #10.
45394607 1523
3756e7ca
RGS
1524By default, lowercase is before uppercase.
1525If the parameter is made true, this is reversed.
45394607 1526
3756e7ca
RGS
1527B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1528distinctions must occur in level 3, and their weights at level 3 must be
1529same as those mentioned in 7.3.1, UTS #10.
1530If you define your collation elements which differs from this requirement,
1531this parameter doesn't work validly.
45394607 1532
91ae00cb
NC
1533=item variable
1534
91ae00cb
NC
1535-- see 3.2.2 Variable Weighting, UTS #10.
1536
91ae00cb
NC
1537This key allows to variable weighting for variable collation elements,
1538which are marked with an ASTERISK in the table
f58b9ef1 1539(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
91ae00cb
NC
1540
1541 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1542
1543These names are case-insensitive.
1544By default (if specification is omitted), 'shifted' is adopted.
1545
1546 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1547 considered at the 4th level.
1548
abd1ec54 1549 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1550
1551 'Shifted' Variable elements are made ignorable at levels 1 through 3
1552 their level 4 weight is replaced by the old level 1 weight.
1553 Level 4 weight for Non-Variable elements is 0xFFFF.
1554
1555 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1556 are trimmed.
1557
45394607
JH
1558=back
1559
3164dd77 1560=head2 Methods for Collation
45394607
JH
1561
1562=over 4
1563
5398038e 1564=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1565
1566Sorts a list of strings.
1567
5398038e 1568=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1569
1570Returns 1 (when C<$a> is greater than C<$b>)
1571or 0 (when C<$a> is equal to C<$b>)
1572or -1 (when C<$a> is lesser than C<$b>).
1573
5398038e 1574=item C<$result = $Collator-E<gt>eq($a, $b)>
1575
1576=item C<$result = $Collator-E<gt>ne($a, $b)>
1577
1578=item C<$result = $Collator-E<gt>lt($a, $b)>
1579
1580=item C<$result = $Collator-E<gt>le($a, $b)>
1581
1582=item C<$result = $Collator-E<gt>gt($a, $b)>
1583
1584=item C<$result = $Collator-E<gt>ge($a, $b)>
1585
a7fbee98 1586They works like the same name operators as theirs.
5398038e 1587
1588 eq : whether $a is equal to $b.
1589 ne : whether $a is not equal to $b.
1590 lt : whether $a is lesser than $b.
1591 le : whether $a is lesser than $b or equal to $b.
1592 gt : whether $a is greater than $b.
1593 ge : whether $a is greater than $b or equal to $b.
1594
1595=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1596
3756e7ca 1597-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1598
1599Returns a sort key.
1600
1601You compare the sort keys using a binary comparison
1602and get the result of the comparison of the strings using UCA.
1603
5398038e 1604 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1605
1606 is equivalent to
1607
5398038e 1608 $Collator->cmp($a, $b)
45394607 1609
a7fbee98
JH
1610=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1611
3756e7ca
RGS
1612Converts a sorting key into its representation form.
1613If C<UCA_Version> is 8, the output is slightly different.
1614
a7fbee98
JH
1615 use Unicode::Collate;
1616 my $c = Unicode::Collate->new();
1617 print $c->viewSortKey("Perl"),"\n";
1618
0116f5dc
JH
1619 # output:
1620 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1621 # Level 1 Level 2 Level 3 Level 4
1622
4d36a948
ST
1623=back
1624
1625=head2 Methods for Searching
d16e9e3d 1626
4d36a948
ST
1627The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1628like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1629but they are not aware of any pattern, but only a literal substring.
1630
19265284
CBW
1631B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1632for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1633C<subst>, C<gsubst>) is croaked, as the position and the length might
1634differ from those on the specified string.
1635
1636C<rearrange> and C<hangul_terminator> parameters are neglected.
1637C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1638matching and searching, as it doesn't matter whether greater or lesser.
1639
4d36a948
ST
1640=over 4
1641
1642=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1643
1644=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1645
1646If C<$substring> matches a part of C<$string>, returns
1647the position of the first occurrence of the matching part in scalar context;
1648in list context, returns a two-element list of
1649the position and the length of the matching part.
1650
d16e9e3d
JH
1651If C<$substring> does not match any part of C<$string>,
1652returns C<-1> in scalar context and
1653an empty list in list context.
1654
1655e.g. you say
1656
5398038e 1657 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948
ST
1658 # (normalization => undef) is REQUIRED.
1659 my $str = "Ich muß studieren Perl.";
1660 my $sub = "MÜSS";
d16e9e3d 1661 my $match;
a7fbee98 1662 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1663 $match = substr($str, $pos, $len);
d16e9e3d
JH
1664 }
1665
4d36a948 1666and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1667is primary equal to C<"MÜSS">.
4d36a948
ST
1668
1669=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1670
1671=item C<($match) = $Collator-E<gt>match($string, $substring)>
1672
1673If C<$substring> matches a part of C<$string>, in scalar context, returns
1674B<a reference to> the first occurrence of the matching part
1675(C<$match_ref> is always true if matches,
1676since every reference is B<true>);
1677in list context, returns the first occurrence of the matching part.
1678
1679If C<$substring> does not match any part of C<$string>,
1680returns C<undef> in scalar context and
1681an empty list in list context.
1682
1683e.g.
1684
1685 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1686 print "matches [$$match_ref].\n";
1687 } else {
1688 print "doesn't match.\n";
1689 }
1690
3756e7ca 1691 or
4d36a948
ST
1692
1693 if (($match) = $Collator->match($str, $sub)) { # list context
1694 print "matches [$match].\n";
1695 } else {
1696 print "doesn't match.\n";
1697 }
1698
1699=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1700
1701If C<$substring> matches a part of C<$string>, returns
1702all the matching parts (or matching count in scalar context).
1703
1704If C<$substring> does not match any part of C<$string>,
1705returns an empty list.
1706
1707=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1708
1709If C<$substring> matches a part of C<$string>,
1710the first occurrence of the matching part is replaced by C<$replacement>
19265284 1711(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
4d36a948
ST
1712
1713C<$replacement> can be a C<CODEREF>,
1714taking the matching part as an argument,
1715and returning a string to replace the matching part
1716(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1717
1718=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1719
1720If C<$substring> matches a part of C<$string>,
19265284
CBW
1721all the occurrences of the matching part are replaced by C<$replacement>
1722(C<$string> is modified) and C<$count> is returned.
4d36a948
ST
1723
1724C<$replacement> can be a C<CODEREF>,
1725taking the matching part as an argument,
1726and returning a string to replace the matching part
1727(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1728
1729e.g.
1730
1731 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1732 # (normalization => undef) is REQUIRED.
19265284 1733 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
4d36a948
ST
1734 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1735
19265284 1736 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
4d36a948 1737 # i.e., all the camels are made bold-faced.
d16e9e3d 1738
19265284
CBW
1739 Examples: levels and ignore_level2 - what does camel match?
1740 ---------------------------------------------------------------------------
1741 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1742 -----------------------|---------------------------------------------------
1743 1 false | yes yes yes yes yes
1744 2 false | yes yes no yes yes
1745 3 false | yes no no yes yes
1746 4 false | yes no no no yes
1747 -----------------------|---------------------------------------------------
1748 1 true | yes yes yes yes yes
1749 2 true | yes yes yes yes yes
1750 3 true | yes no yes yes yes
1751 4 true | yes no yes no yes
1752 ---------------------------------------------------------------------------
1753 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1754 at any level.
1755
45394607
JH
1756=back
1757
3164dd77
ST
1758=head2 Other Methods
1759
1760=over 4
1761
0116f5dc
JH
1762=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1763
68adb2b0
CBW
1764=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1765
19265284 1766Changes the value of specified keys and returns the changed part.
0116f5dc
JH
1767
1768 $Collator = Unicode::Collate->new(level => 4);
1769
1770 $Collator->eq("perl", "PERL"); # false
1771
1772 %old = $Collator->change(level => 2); # returns (level => 4).
1773
1774 $Collator->eq("perl", "PERL"); # true
1775
1776 $Collator->change(%old); # returns (level => 2).
1777
1778 $Collator->eq("perl", "PERL"); # false
1779
1780Not all C<(key,value)>s are allowed to be changed.
1781See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1782
1783In the scalar context, returns the modified collator
1784(but it is B<not> a clone from the original).
1785
1786 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1787
1788 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1789
1790 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1791
91ae00cb 1792=item C<$version = $Collator-E<gt>version()>
3164dd77 1793
91ae00cb
NC
1794Returns the version number (a string) of the Unicode Standard
1795which the C<table> file used by the collator object is based on.
1796If the table does not include a version line (starting with C<@version>),
1797returns C<"unknown">.
1798
1799=item C<UCA_Version()>
3164dd77 1800
7b98b857
CBW
1801Returns the revision number of UTS #10 this module consults,
1802that should correspond with the DUCET incorporated.
3164dd77 1803
91ae00cb
NC
1804=item C<Base_Unicode_Version()>
1805
7b98b857
CBW
1806Returns the version number of UTS #10 this module consults,
1807that should correspond with the DUCET incorporated.
3164dd77
ST
1808
1809=back
1810
3756e7ca
RGS
1811=head1 EXPORT
1812
1813No method will be exported.
45394607 1814
0d50d293
RGS
1815=head1 INSTALL
1816
1817Though this module can be used without any C<table> file,
1818to use this module easily, it is recommended to install a table file
1819in the UCA format, by copying it under the directory
1820<a place in @INC>/Unicode/Collate.
1821
6d24ed10
SP
1822The most preferable one is "The Default Unicode Collation Element Table"
1823(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
1824
1825 http://www.unicode.org/Public/UCA/
1826
1827 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1828
1829If DUCET is not installed, it is recommended to copy the file
1830from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1831to <a place in @INC>/Unicode/Collate/allkeys.txt
1832manually.
1833
3756e7ca
RGS
1834=head1 CAVEATS
1835
1836=over 4
45394607 1837
3756e7ca 1838=item Normalization
45394607 1839
3756e7ca
RGS
1840Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1841module (see L<Unicode::Normalize>).
45394607 1842
5398038e 1843If you need not it (say, in the case when you need not
45394607
JH
1844handle any combining characters),
1845assign C<normalization =E<gt> undef> explicitly.
1846
4d36a948 1847-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1848
3756e7ca 1849=item Conformance Test
0116f5dc 1850
10d7ec48
NC
1851The Conformance Test for the UCA is available
1852under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
1853
1854For F<CollationTest_SHIFTED.txt>,
1855a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1856for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1857C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1858
4d36a948 1859B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1860
3756e7ca
RGS
1861=back
1862
0d50d293 1863=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 1864
0d50d293 1865The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
f58b9ef1 1866<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2011,
0d50d293 1867SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1868
0d50d293
RGS
1869This module is free software; you can redistribute it and/or
1870modify it under the same terms as Perl itself.
45394607 1871
00e00351 1872The file Unicode/Collate/allkeys.txt was copied verbatim
7b98b857
CBW
1873from L<http://www.unicode.org/Public/UCA/6.0.0/allkeys.txt>.
1874This file is Copyright (c) 1991-2010 Unicode, Inc. All rights reserved.
6d24ed10 1875Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
45394607
JH
1876
1877=head1 SEE ALSO
1878
1879=over 4
1880
91ae00cb
NC
1881=item Unicode Collation Algorithm - UTS #10
1882
1883L<http://www.unicode.org/reports/tr10/>
1884
1885=item The Default Unicode Collation Element Table (DUCET)
1886
10d7ec48 1887L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1888
91ae00cb 1889=item The conformance test for the UCA
45394607 1890
10d7ec48 1891L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1892
10d7ec48 1893L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1894
91ae00cb 1895=item Hangul Syllable Type
0116f5dc 1896
10d7ec48 1897L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1898
91ae00cb 1899=item Unicode Normalization Forms - UAX #15
a7fbee98 1900
91ae00cb 1901L<http://www.unicode.org/reports/tr15/>
a7fbee98 1902
aa7758f7
CBW
1903=item Unicode Locale Data Markup Language (LDML) - UTS #35
1904
1905L<http://www.unicode.org/reports/tr35/>
1906
45394607
JH
1907=back
1908
1909=cut