This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Merge branch 'blead' of ssh://perl5.git.perl.org/gitroot/perl into blead
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c
JH
6 }
7}
8
45394607
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48
NC
15no warnings 'utf8';
16
16e0d770 17our $VERSION = '0.52_01';
45394607
JH
18our $PACKAGE = __PACKAGE__;
19
e7f779c8
RGS
20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
45394607 22
4d36a948
TS
23# Perl's boolean
24use constant TRUE => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 31my $CVgetCombinClass;
4d36a948 32
9f1f04a1
RGS
33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
4d36a948 37# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
4d36a948
TS
40
41# Shifted weight at 4th level
9f1f04a1 42use constant Shift4Wt => 0xFFFF;
4d36a948 43
4d36a948
TS
44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
4d36a948
TS
51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
0116f5dc
JH
70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR => 1; # Variable character
3164dd77 72
91ae00cb
NC
73# specific code points
74use constant Hangul_LBase => 0x1100;
75use constant Hangul_LIni => 0x1100;
76use constant Hangul_LFin => 0x1159;
77use constant Hangul_LFill => 0x115F;
78use constant Hangul_VBase => 0x1161;
3756e7ca 79use constant Hangul_VIni => 0x1160; # from Vowel Filler
91ae00cb 80use constant Hangul_VFin => 0x11A2;
3756e7ca 81use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
91ae00cb
NC
82use constant Hangul_TIni => 0x11A8;
83use constant Hangul_TFin => 0x11F9;
84use constant Hangul_TCount => 28;
85use constant Hangul_NCount => 588;
86use constant Hangul_SBase => 0xAC00;
87use constant Hangul_SIni => 0xAC00;
88use constant Hangul_SFin => 0xD7A3;
89use constant CJK_UidIni => 0x4E00;
90use constant CJK_UidFin => 0x9FA5;
3756e7ca 91use constant CJK_UidF41 => 0x9FBB;
91ae00cb
NC
92use constant CJK_ExtAIni => 0x3400;
93use constant CJK_ExtAFin => 0x4DB5;
94use constant CJK_ExtBIni => 0x20000;
95use constant CJK_ExtBFin => 0x2A6D6;
96use constant BMP_Max => 0xFFFF;
97
4d36a948 98# Logical_Order_Exception in PropList.txt
e7f779c8 99my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 100
3756e7ca 101sub UCA_Version { "14" }
a7fbee98 102
3756e7ca 103sub Base_Unicode_Version { "4.1.0" }
a7fbee98 104
9f1f04a1
RGS
105######
106
9f1f04a1 107sub pack_U {
ae6aa562 108 return pack('U*', @_);
9f1f04a1
RGS
109}
110
111sub unpack_U {
3756e7ca 112 return unpack('U*', shift(@_).pack('U*'));
9f1f04a1
RGS
113}
114
115######
116
91ae00cb
NC
117my (%VariableOK);
118@VariableOK{ qw/
0116f5dc 119 blanked non-ignorable shifted shift-trimmed
91ae00cb 120 / } = (); # keys lowercased
0116f5dc
JH
121
122our @ChangeOK = qw/
123 alternate backwards level normalization rearrange
124 katakana_before_hiragana upper_before_lower
125 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 126 hangul_terminator variable
0116f5dc
JH
127 /;
128
129our @ChangeNG = qw/
91ae00cb
NC
130 entry mapping table maxlength
131 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
e7f779c8 133 derivCode normCode rearrangeHash
9f1f04a1 134 backwardsFlag
0116f5dc 135 /;
9f1f04a1
RGS
136# The hash key 'ignored' is deleted at v 0.21.
137# The hash key 'isShift' is deleted at v 0.23.
138# The hash key 'combining' is deleted at v 0.24.
91ae00cb 139# The hash key 'entries' is deleted at v 0.30.
e7f779c8 140# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb
NC
141
142sub version {
143 my $self = shift;
144 return $self->{versionTable} || 'unknown';
145}
0116f5dc
JH
146
147my (%ChangeOK, %ChangeNG);
148@ChangeOK{ @ChangeOK } = ();
149@ChangeNG{ @ChangeNG } = ();
150
151sub change {
152 my $self = shift;
153 my %hash = @_;
154 my %old;
91ae00cb
NC
155 if (exists $hash{variable} && exists $hash{alternate}) {
156 delete $hash{alternate};
157 }
158 elsif (!exists $hash{variable} && exists $hash{alternate}) {
159 $hash{variable} = $hash{alternate};
160 }
0116f5dc
JH
161 foreach my $k (keys %hash) {
162 if (exists $ChangeOK{$k}) {
163 $old{$k} = $self->{$k};
164 $self->{$k} = $hash{$k};
165 }
166 elsif (exists $ChangeNG{$k}) {
167 croak "change of $k via change() is not allowed!";
168 }
169 # else => ignored
170 }
3756e7ca 171 $self->checkCollator();
0116f5dc
JH
172 return wantarray ? %old : $self;
173}
a7fbee98 174
9f1f04a1
RGS
175sub _checkLevel {
176 my $level = shift;
abd1ec54
NC
177 my $key = shift; # 'level' or 'backwards'
178 MinLevel <= $level or croak sprintf
179 "Illegal level %d (in value for key '%s') lower than %d.",
180 $level, $key, MinLevel;
181 $level <= MaxLevel or croak sprintf
182 "Unsupported level %d (in value for key '%s') higher than %d.",
183 $level, $key, MaxLevel;
9f1f04a1
RGS
184}
185
91ae00cb
NC
186my %DerivCode = (
187 8 => \&_derivCE_8,
188 9 => \&_derivCE_9,
189 11 => \&_derivCE_9, # 11 == 9
3756e7ca 190 14 => \&_derivCE_14,
91ae00cb
NC
191);
192
0116f5dc
JH
193sub checkCollator {
194 my $self = shift;
9f1f04a1 195 _checkLevel($self->{level}, "level");
a7fbee98 196
91ae00cb
NC
197 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
198 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 199
91ae00cb 200 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 201 $self->{alternateTable} || 'shifted';
91ae00cb
NC
202 $self->{variable} = $self->{alternate} = lc($self->{variable});
203 exists $VariableOK{ $self->{variable} }
3756e7ca 204 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 205
9f1f04a1
RGS
206 if (! defined $self->{backwards}) {
207 $self->{backwardsFlag} = 0;
208 }
209 elsif (! ref $self->{backwards}) {
210 _checkLevel($self->{backwards}, "backwards");
211 $self->{backwardsFlag} = 1 << $self->{backwards};
212 }
213 else {
214 my %level;
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
218 $level{$b} = 1;
219 }
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
222 }
223 }
0116f5dc 224
91ae00cb
NC
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
228
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
231
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
234 }
235
236 $self->{normCode} = undef;
a7fbee98
JH
237
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
91ae00cb 240 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 241
91ae00cb 242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 243
91ae00cb
NC
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
246 }
247 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
1d2654e1
JH
250 Unicode::Normalize::normalize($norm, shift);
251 };
06c8fc8f
RGS
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 254 }
a7fbee98 255 }
0116f5dc
JH
256 return;
257}
258
259sub new
260{
261 my $class = shift;
262 my $self = bless { @_ }, $class;
45394607 263
a7fbee98 264 # If undef is passed explicitly, no file is read.
0116f5dc 265 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 266 $self->read_table() if defined $self->{table};
905aa9f0 267
a7fbee98 268 if ($self->{entry}) {
e7f779c8
RGS
269 while ($self->{entry} =~ /([^\n]+)/g) {
270 $self->parseEntry($1);
271 }
a7fbee98 272 }
905aa9f0 273
9f1f04a1 274 $self->{level} ||= MaxLevel;
0116f5dc 275 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 276
abd1ec54 277 $self->{overrideHangul} = FALSE
0116f5dc 278 if ! exists $self->{overrideHangul};
abd1ec54 279 $self->{overrideCJK} = FALSE
0116f5dc 280 if ! exists $self->{overrideCJK};
06c8fc8f 281 $self->{normalization} = 'NFD'
0116f5dc 282 if ! exists $self->{normalization};
3756e7ca
RGS
283 $self->{rearrange} = $self->{rearrangeTable} ||
284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 285 if ! exists $self->{rearrange};
0116f5dc
JH
286 $self->{backwards} = $self->{backwardsTable}
287 if ! exists $self->{backwards};
a7fbee98 288
3756e7ca 289 $self->checkCollator();
a7fbee98
JH
290
291 return $self;
292}
905aa9f0
TS
293
294sub read_table {
a7fbee98 295 my $self = shift;
a7fbee98 296
e7f779c8
RGS
297 my($f, $fh);
298 foreach my $d (@INC) {
299 $f = File::Spec->catfile($d, @Path, $self->{table});
300 last if open($fh, $f);
301 $f = undef;
302 }
0d50d293
RGS
303 if (!defined $f) {
304 $f = File::Spec->catfile(@Path, $self->{table});
305 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
306 }
a7fbee98 307
6d24ed10
SP
308 while (my $line = <$fh>) {
309 next if $line =~ /^\s*#/;
310 unless ($line =~ s/^\s*\@//) {
311 $self->parseEntry($line);
a7fbee98
JH
312 next;
313 }
abd1ec54 314
6d24ed10
SP
315 # matched ^\s*\@
316 if ($line =~ /^version\s*(\S*)/) {
abd1ec54
NC
317 $self->{versionTable} ||= $1;
318 }
6d24ed10 319 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
abd1ec54
NC
320 $self->{variableTable} ||= $1;
321 }
6d24ed10 322 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
abd1ec54
NC
323 $self->{alternateTable} ||= $1;
324 }
6d24ed10 325 elsif ($line =~ /^backwards\s+(\S*)/) {
abd1ec54
NC
326 push @{ $self->{backwardsTable} }, $1;
327 }
6d24ed10 328 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
abd1ec54
NC
329 push @{ $self->{forwardsTable} }, $1;
330 }
6d24ed10 331 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
abd1ec54
NC
332 push @{ $self->{rearrangeTable} }, _getHexArray($1);
333 }
45394607 334 }
e7f779c8 335 close $fh;
45394607
JH
336}
337
905aa9f0 338
45394607
JH
339##
340## get $line, parse it, and write an entry in $self
341##
342sub parseEntry
343{
a7fbee98
JH
344 my $self = shift;
345 my $line = shift;
4d36a948 346 my($name, $entry, @uv, @key);
a7fbee98
JH
347
348 return if $line !~ /^\s*[0-9A-Fa-f]/;
349
350 # removes comment and gets name
351 $name = $1
352 if $line =~ s/[#%]\s*(.*)//;
353 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
354
355 # gets element
356 my($e, $k) = split /;/, $line;
357 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
358 if ! $k;
359
4d36a948
TS
360 @uv = _getHexArray($e);
361 return if !@uv;
362
363 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 364
4d36a948 365 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 366 my $ele = pack_U(@uv);
a7fbee98 367
4d36a948
TS
368 # regarded as if it were not entried in the table
369 return
370 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 371
4d36a948
TS
372 # replaced as completely ignorable
373 $k = '[.0000.0000.0000.0000]'
374 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 375 }
0116f5dc 376
4d36a948
TS
377 # replaced as completely ignorable
378 $k = '[.0000.0000.0000.0000]'
379 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
380
4c843366 381 my $is_L3_ignorable = TRUE;
4d36a948 382
caffd4cf
TS
383 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
384 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
TS
385 my @wt = _getHexArray($arr);
386 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 387 $is_L3_ignorable = FALSE
3756e7ca
RGS
388 if $wt[0] || $wt[1] || $wt[2];
389 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
390 # is completely ignorable.
4c843366
JH
391 # For expansion, an entry $is_L3_ignorable
392 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 393 }
caffd4cf 394
e7f779c8 395 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 396
91ae00cb
NC
397 if (@uv > 1) {
398 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
399 and $self->{maxlength}{$uv[0]} = @uv;
400 }
45394607
JH
401}
402
9f1f04a1 403
45394607 404##
abd1ec54 405## VCE = _varCE(variable term, VCE)
45394607 406##
abd1ec54 407sub _varCE
45394607 408{
abd1ec54
NC
409 my $vbl = shift;
410 my $vce = shift;
411 if ($vbl eq 'non-ignorable') {
412 return $vce;
413 }
414 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
415
416 if ($var) {
417 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
418 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
419 }
420 elsif ($vbl eq 'blanked') {
421 return $vce;
422 }
423 else {
424 return pack(VCE_TEMPLATE, $var, @wt[0..2],
425 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
426 }
45394607
JH
427}
428
45394607
JH
429sub viewSortKey
430{
a7fbee98 431 my $self = shift;
9f1f04a1
RGS
432 $self->visualizeSortKey($self->getSortKey(@_));
433}
0116f5dc 434
9f1f04a1
RGS
435sub visualizeSortKey
436{
437 my $self = shift;
438 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 439
9f1f04a1 440 if ($self->{UCA_Version} <= 8) {
0116f5dc
JH
441 $view =~ s/ ?0000 ?/|/g;
442 } else {
443 $view =~ s/\b0000\b/|/g;
444 }
a7fbee98 445 return "[$view]";
45394607
JH
446}
447
d16e9e3d 448
45394607 449##
91ae00cb
NC
450## arrayref of JCPS = splitEnt(string to be collated)
451## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 452##
91ae00cb 453sub splitEnt
45394607 454{
a7fbee98 455 my $self = shift;
4d36a948
TS
456 my $wLen = $_[1];
457
a7fbee98 458 my $code = $self->{preprocess};
0116f5dc 459 my $norm = $self->{normCode};
91ae00cb 460 my $map = $self->{mapping};
a7fbee98
JH
461 my $max = $self->{maxlength};
462 my $reH = $self->{rearrangeHash};
3756e7ca 463 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
a7fbee98 464
4d36a948 465 my ($str, @buf);
a7fbee98 466
4d36a948
TS
467 if ($wLen) {
468 $code and croak "Preprocess breaks character positions. "
469 . "Don't use with index(), match(), etc.";
470 $norm and croak "Normalization breaks character positions. "
471 . "Don't use with index(), match(), etc.";
472 $str = $_[0];
473 }
474 else {
475 $str = $_[0];
476 $str = &$code($str) if ref $code;
477 $str = &$norm($str) if ref $norm;
478 }
a7fbee98 479
4d36a948 480 # get array of Unicode code point of string.
9f1f04a1 481 my @src = unpack_U($str);
4d36a948
TS
482
483 # rearrangement:
484 # Character positions are not kept if rearranged,
485 # then neglected if $wLen is true.
486 if ($reH && ! $wLen) {
a7fbee98
JH
487 for (my $i = 0; $i < @src; $i++) {
488 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
489 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
490 $i++;
491 }
492 }
45394607 493 }
45394607 494
3756e7ca 495 # remove a code point marked as a completely ignorable.
abd1ec54
NC
496 for (my $i = 0; $i < @src; $i++) {
497 $src[$i] = undef
e7f779c8
RGS
498 if _isIllegal($src[$i]) || ($ver9 &&
499 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
0116f5dc
JH
500 }
501
a7fbee98 502 for (my $i = 0; $i < @src; $i++) {
91ae00cb 503 my $jcps = $src[$i];
3756e7ca
RGS
504
505 # skip removed code point
506 if (! defined $jcps) {
507 if ($wLen && @buf) {
508 $buf[-1][2] = $i + 1;
509 }
510 next;
511 }
512
abd1ec54 513 my $i_orig = $i;
4d36a948 514
3756e7ca
RGS
515 # find contraction
516 if ($max->{$jcps}) {
91ae00cb
NC
517 my $temp_jcps = $jcps;
518 my $jcpsLen = 1;
519 my $maxLen = $max->{$jcps};
4d36a948 520
91ae00cb 521 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 522 next if ! defined $src[$p];
91ae00cb
NC
523 $temp_jcps .= CODE_SEP . $src[$p];
524 $jcpsLen++;
525 if ($map->{$temp_jcps}) {
526 $jcps = $temp_jcps;
4d36a948
TS
527 $i = $p;
528 }
529 }
4d36a948 530
06c8fc8f
RGS
531 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
532 # This process requires Unicode::Normalize.
91ae00cb 533 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
534 # (in spite of bool value of $CVgetCombinClass),
535 # since canonical ordering cannot be expected.
536 # Blocked combining character should not be contracted.
537
538 if ($self->{normalization})
539 # $self->{normCode} is false in the case of "prenormalized".
540 {
541 my $preCC = 0;
542 my $curCC = 0;
543
544 for (my $p = $i + 1; $p < @src; $p++) {
545 next if ! defined $src[$p];
546 $curCC = $CVgetCombinClass->($src[$p]);
547 last unless $curCC;
548 my $tail = CODE_SEP . $src[$p];
91ae00cb
NC
549 if ($preCC != $curCC && $map->{$jcps.$tail}) {
550 $jcps .= $tail;
06c8fc8f
RGS
551 $src[$p] = undef;
552 } else {
553 $preCC = $curCC;
554 }
4d36a948 555 }
a7fbee98 556 }
a7fbee98
JH
557 }
558
3756e7ca
RGS
559 # skip completely ignorable
560 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
561 if ($wLen && @buf) {
562 $buf[-1][2] = $i + 1;
a7fbee98 563 }
3756e7ca 564 next;
a7fbee98 565 }
4d36a948 566
91ae00cb 567 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 568 }
4d36a948 569 return \@buf;
d16e9e3d 570}
45394607 571
d16e9e3d
JH
572
573##
abd1ec54 574## list of VCE = getWt(JCPS)
d16e9e3d
JH
575##
576sub getWt
577{
a7fbee98 578 my $self = shift;
91ae00cb 579 my $u = shift;
abd1ec54 580 my $vbl = $self->{variable};
91ae00cb 581 my $map = $self->{mapping};
0116f5dc 582 my $der = $self->{derivCode};
a7fbee98 583
91ae00cb 584 return if !defined $u;
abd1ec54 585 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 586 if $map->{$u};
a7fbee98 587
91ae00cb
NC
588 # JCPS must not be a contraction, then it's a code point.
589 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1
JH
590 my $hang = $self->{overrideHangul};
591 my @hangulCE;
592 if ($hang) {
593 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
594 }
595 elsif (!defined $hang) {
596 @hangulCE = $der->($u);
597 }
598 else {
599 my $max = $self->{maxlength};
600 my @decH = _decompHangul($u);
601
602 if (@decH == 2) {
603 my $contract = join(CODE_SEP, @decH);
91ae00cb 604 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
605 } else { # must be <@decH == 3>
606 if ($max->{$decH[0]}) {
607 my $contract = join(CODE_SEP, @decH);
91ae00cb 608 if ($map->{$contract}) {
1d2654e1
JH
609 @decH = ($contract);
610 } else {
611 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 612 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
613 }
614 # even if V's ignorable, LT contraction is not supported.
615 # If such a situatution were required, NFD should be used.
616 }
617 if (@decH == 3 && $max->{$decH[1]}) {
618 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 619 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
620 }
621 }
622
623 @hangulCE = map({
91ae00cb 624 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1
JH
625 } @decH);
626 }
abd1ec54 627 return map _varCE($vbl, $_), @hangulCE;
a7fbee98 628 }
3756e7ca 629 elsif (_isUIdeo($u, $self->{UCA_Version})) {
1d2654e1 630 my $cjk = $self->{overrideCJK};
abd1ec54 631 return map _varCE($vbl, $_),
0116f5dc 632 $cjk
4d36a948 633 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
3756e7ca
RGS
634 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
635 ? _uideoCE_8($u)
0116f5dc 636 : $der->($u);
a7fbee98
JH
637 }
638 else {
abd1ec54 639 return map _varCE($vbl, $_), $der->($u);
a7fbee98 640 }
d16e9e3d
JH
641}
642
d16e9e3d
JH
643
644##
645## string sortkey = getSortKey(string arg)
646##
647sub getSortKey
648{
a7fbee98
JH
649 my $self = shift;
650 my $lev = $self->{level};
91ae00cb 651 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
3756e7ca
RGS
652 my $v2i = $self->{UCA_Version} >= 9 &&
653 $self->{variable} ne 'non-ignorable';
91ae00cb 654
abd1ec54 655 my @buf; # weight arrays
91ae00cb
NC
656 if ($self->{hangul_terminator}) {
657 my $preHST = '';
658 foreach my $jcps (@$rEnt) {
659 # weird things like VL, TL-contraction are not considered!
660 my $curHST = '';
661 foreach my $u (split /;/, $jcps) {
662 $curHST .= getHST($u);
663 }
664 if ($preHST && !$curHST || # hangul before non-hangul
665 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
666 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
667 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 668
abd1ec54 669 push @buf, $self->getWtHangulTerm();
91ae00cb
NC
670 }
671 $preHST = $curHST;
672
abd1ec54 673 push @buf, $self->getWt($jcps);
91ae00cb
NC
674 }
675 $preHST # end at hangul
abd1ec54 676 and push @buf, $self->getWtHangulTerm();
91ae00cb
NC
677 }
678 else {
679 foreach my $jcps (@$rEnt) {
abd1ec54 680 push @buf, $self->getWt($jcps);
91ae00cb
NC
681 }
682 }
683
abd1ec54
NC
684 # make sort key
685 my @ret = ([],[],[],[]);
686 my $last_is_variable;
687
688 foreach my $vwt (@buf) {
689 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
3756e7ca
RGS
690
691 # "Ignorable (L1, L2) after Variable" since track. v. 9
abd1ec54
NC
692 if ($v2i) {
693 if ($var) {
694 $last_is_variable = TRUE;
695 }
696 elsif (!$wt[0]) { # ignorable
4d36a948 697 next if $last_is_variable;
abd1ec54
NC
698 }
699 else {
700 $last_is_variable = FALSE;
0116f5dc
JH
701 }
702 }
abd1ec54
NC
703 foreach my $v (0..$lev-1) {
704 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98
JH
705 }
706 }
45394607 707
a7fbee98
JH
708 # modification of tertiary weights
709 if ($self->{upper_before_lower}) {
6d24ed10
SP
710 foreach my $w (@{ $ret[2] }) {
711 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
712 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
713 elsif ($w == 0x1C) { $w += 1 } # square upper
714 elsif ($w == 0x1D) { $w -= 1 } # square lower
a7fbee98 715 }
45394607 716 }
a7fbee98 717 if ($self->{katakana_before_hiragana}) {
6d24ed10
SP
718 foreach my $w (@{ $ret[2] }) {
719 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
720 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
a7fbee98 721 }
45394607 722 }
9f1f04a1
RGS
723
724 if ($self->{backwardsFlag}) {
725 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
726 if ($self->{backwardsFlag} & (1 << $v)) {
727 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
728 }
729 }
730 }
731
4d36a948 732 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607
JH
733}
734
735
736##
d16e9e3d 737## int compare = cmp(string a, string b)
45394607 738##
5398038e
TS
739sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
740sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
741sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
742sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
743sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
744sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
745sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
746
747##
d16e9e3d 748## list[strings] sorted = sort(list[strings] arg)
45394607 749##
a7fbee98
JH
750sub sort {
751 my $obj = shift;
752 return
753 map { $_->[1] }
754 sort{ $a->[0] cmp $b->[0] }
755 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
756}
757
0116f5dc 758
3756e7ca
RGS
759sub _derivCE_14 {
760 my $u = shift;
761 my $base =
762 (CJK_UidIni <= $u && $u <= CJK_UidF41)
763 ? 0xFB40 : # CJK
764 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
765 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
766 ? 0xFB80 # CJK ext.
767 : 0xFBC0; # others
768
769 my $aaaa = $base + ($u >> 15);
770 my $bbbb = ($u & 0x7FFF) | 0x8000;
771 return
772 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
773 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
774}
775
4d36a948 776sub _derivCE_9 {
0116f5dc
JH
777 my $u = shift;
778 my $base =
91ae00cb 779 (CJK_UidIni <= $u && $u <= CJK_UidFin)
4d36a948 780 ? 0xFB40 : # CJK
91ae00cb
NC
781 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
782 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
4d36a948
TS
783 ? 0xFB80 # CJK ext.
784 : 0xFBC0; # others
0116f5dc
JH
785
786 my $aaaa = $base + ($u >> 15);
787 my $bbbb = ($u & 0x7FFF) | 0x8000;
788 return
9f1f04a1
RGS
789 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
790 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc
JH
791}
792
4d36a948 793sub _derivCE_8 {
0116f5dc
JH
794 my $code = shift;
795 my $aaaa = 0xFF80 + ($code >> 15);
796 my $bbbb = ($code & 0x7FFF) | 0x8000;
797 return
4d36a948
TS
798 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
799 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607
JH
800}
801
3756e7ca
RGS
802sub _uideoCE_8 {
803 my $u = shift;
804 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
805}
806
807sub _isUIdeo {
808 my ($u, $uca_vers) = @_;
809 return(
810 (CJK_UidIni <= $u &&
811 ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
812 ||
813 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
814 ||
815 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
816 );
817}
818
91ae00cb 819
abd1ec54 820sub getWtHangulTerm {
91ae00cb 821 my $self = shift;
abd1ec54
NC
822 return _varCE($self->{variable},
823 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
91ae00cb
NC
824}
825
826
45394607
JH
827##
828## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
829##
a7fbee98 830sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 831
a7fbee98 832#
4d36a948 833# $code *must* be in Hangul syllable.
a7fbee98
JH
834# Check it before you enter here.
835#
5398038e
TS
836sub _decompHangul {
837 my $code = shift;
3756e7ca
RGS
838 my $si = $code - Hangul_SBase;
839 my $li = int( $si / Hangul_NCount);
840 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
841 my $ti = $si % Hangul_TCount;
5398038e 842 return (
3756e7ca
RGS
843 Hangul_LBase + $li,
844 Hangul_VBase + $vi,
845 $ti ? (Hangul_TBase + $ti) : (),
5398038e 846 );
45394607
JH
847}
848
10d7ec48 849sub _isIllegal {
4d36a948
TS
850 my $code = shift;
851 return ! defined $code # removed
852 || ($code < 0 || 0x10FFFF < $code) # out of range
853 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
854 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
855 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
856 ;
857}
858
91ae00cb
NC
859# Hangul Syllable Type
860sub getHST {
861 my $u = shift;
862 return
863 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
864 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
865 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" :
866 Hangul_SIni <= $u && $u <= Hangul_SFin ?
867 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
868}
869
4d36a948
TS
870
871##
872## bool _nonIgnorAtLevel(arrayref weights, int level)
873##
874sub _nonIgnorAtLevel($$)
875{
876 my $wt = shift;
877 return if ! defined $wt;
878 my $lv = shift;
9f1f04a1 879 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
TS
880}
881
882##
883## bool _eqArray(
884## arrayref of arrayref[weights] source,
885## arrayref of arrayref[weights] substr,
886## int level)
887## * comparison of graphemes vs graphemes.
888## @$source >= @$substr must be true (check it before call this);
889##
890sub _eqArray($$$)
891{
892 my $source = shift;
893 my $substr = shift;
894 my $lev = shift;
895
896 for my $g (0..@$substr-1){
897 # Do the $g'th graphemes have the same number of AV weigths?
898 return if @{ $source->[$g] } != @{ $substr->[$g] };
899
900 for my $w (0..@{ $substr->[$g] }-1) {
901 for my $v (0..$lev-1) {
902 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
903 }
904 }
905 }
906 return 1;
907}
908
909##
910## (int position, int length)
911## int position = index(string, substring, position, [undoc'ed grobal])
912##
913## With "grobal" (only for the list context),
914## returns list of arrayref[position, length].
915##
916sub index
917{
91ae00cb
NC
918 my $self = shift;
919 my $str = shift;
920 my $len = length($str);
921 my $subE = $self->splitEnt(shift);
922 my $pos = @_ ? shift : 0;
923 $pos = 0 if $pos < 0;
924 my $grob = shift;
925
926 my $lev = $self->{level};
3756e7ca
RGS
927 my $v2i = $self->{UCA_Version} >= 9 &&
928 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
929
930 if (! @$subE) {
4d36a948
TS
931 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
932 return $grob
933 ? map([$_, 0], $temp..$len)
934 : wantarray ? ($temp,0) : $temp;
935 }
abd1ec54
NC
936 $len < $pos
937 and return wantarray ? () : NOMATCHPOS;
91ae00cb 938 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
939 @$strE
940 or return wantarray ? () : NOMATCHPOS;
941
4d36a948
TS
942 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
943
abd1ec54
NC
944 my $last_is_variable;
945 for my $vwt (map $self->getWt($_), @$subE) {
946 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
947 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 948
3756e7ca
RGS
949 # "Ignorable (L1, L2) after Variable" since track. v. 9
950 if ($v2i) {
abd1ec54
NC
951 if ($var) {
952 $last_is_variable = TRUE;
953 }
954 elsif (!$wt[0]) { # ignorable
4d36a948 955 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
956 }
957 else {
958 $last_is_variable = FALSE;
4d36a948
TS
959 }
960 }
961
abd1ec54
NC
962 if (@subWt && !$var && !$wt[0]) {
963 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
4d36a948 964 } else {
abd1ec54 965 push @subWt, [ \@wt ];
4d36a948
TS
966 }
967 }
968
969 my $count = 0;
91ae00cb 970 my $end = @$strE - 1;
4d36a948 971
abd1ec54 972 $last_is_variable = FALSE; # reuse
4d36a948
TS
973 for (my $i = 0; $i <= $end; ) { # no $i++
974 my $found_base = 0;
975
976 # fetch a grapheme
977 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
978 for my $vwt ($self->getWt($strE->[$i][0])) {
979 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
980 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 981
3756e7ca
RGS
982 # "Ignorable (L1, L2) after Variable" since track. v. 9
983 if ($v2i) {
abd1ec54
NC
984 if ($var) {
985 $last_is_variable = TRUE;
986 }
987 elsif (!$wt[0]) { # ignorable
4d36a948 988 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
989 }
990 else {
991 $last_is_variable = FALSE;
4d36a948
TS
992 }
993 }
994
abd1ec54
NC
995 if (@strWt && !$var && !$wt[0]) {
996 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 997 $finPos[-1] = $strE->[$i][2];
4d36a948 998 } elsif ($to_be_pushed) {
abd1ec54 999 push @strWt, [ \@wt ];
91ae00cb 1000 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 1001 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 1002 push @finPos, $strE->[$i][2];
4d36a948
TS
1003 $found_base++;
1004 }
1005 # else ===> no-op
1006 }
1007 $i++;
1008 }
1009
1010 # try to match
1011 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1012 if ($iniPos[0] != NOMATCHPOS &&
1013 $finPos[$#subWt] != NOMATCHPOS &&
1014 _eqArray(\@strWt, \@subWt, $lev)) {
1015 my $temp = $iniPos[0] + $pos;
1016
1017 if ($grob) {
1018 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1019 splice @strWt, 0, $#subWt;
1020 splice @iniPos, 0, $#subWt;
1021 splice @finPos, 0, $#subWt;
1022 }
1023 else {
1024 return wantarray
1025 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1026 : $temp;
1027 }
1028 }
1029 shift @strWt;
1030 shift @iniPos;
1031 shift @finPos;
1032 }
1033 }
1034
1035 return $grob
1036 ? @g_ret
1037 : wantarray ? () : NOMATCHPOS;
1038}
1039
1040##
1041## scalarref to matching part = match(string, substring)
1042##
1043sub match
1044{
1045 my $self = shift;
1046 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1047 my $temp = substr($_[0], $pos, $len);
1048 return wantarray ? $temp : \$temp;
1049 # An lvalue ref \substr should be avoided,
1050 # since its value is affected by modification of its referent.
1051 }
1052 else {
1053 return;
1054 }
1055}
1056
1057##
1058## arrayref matching parts = gmatch(string, substring)
1059##
1060sub gmatch
1061{
1062 my $self = shift;
1063 my $str = shift;
1064 my $sub = shift;
1065 return map substr($str, $_->[0], $_->[1]),
1066 $self->index($str, $sub, 0, 'g');
1067}
1068
1069##
1070## bool subst'ed = subst(string, substring, replace)
1071##
1072sub subst
1073{
1074 my $self = shift;
1075 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1076
1077 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1078 if ($code) {
1079 my $mat = substr($_[0], $pos, $len);
1080 substr($_[0], $pos, $len, $code->($mat));
1081 } else {
1082 substr($_[0], $pos, $len, $_[2]);
1083 }
1084 return TRUE;
1085 }
1086 else {
1087 return FALSE;
1088 }
1089}
1090
1091##
1092## int count = gsubst(string, substring, replace)
1093##
1094sub gsubst
1095{
1096 my $self = shift;
1097 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1098 my $cnt = 0;
1099
1100 # Replacement is carried out from the end, then use reverse.
1101 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1102 if ($code) {
1103 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1104 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1105 } else {
1106 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1107 }
1108 $cnt++;
1109 }
1110 return $cnt;
1111}
1112
45394607
JH
11131;
1114__END__
1115
1116=head1 NAME
1117
a7fbee98 1118Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1119
1120=head1 SYNOPSIS
1121
1122 use Unicode::Collate;
1123
1124 #construct
5398038e 1125 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1126
1127 #sort
5398038e 1128 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1129
1130 #compare
a7fbee98 1131 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1132
91ae00cb
NC
1133 # If %tailoring is false (i.e. empty),
1134 # $Collator should do the default collation.
1135
45394607
JH
1136=head1 DESCRIPTION
1137
3756e7ca
RGS
1138This module is an implementation of Unicode Technical Standard #10
1139(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1140
45394607
JH
1141=head2 Constructor and Tailoring
1142
d16e9e3d
JH
1143The C<new> method returns a collator object.
1144
5398038e 1145 $Collator = Unicode::Collate->new(
0116f5dc 1146 UCA_Version => $UCA_Version,
91ae00cb 1147 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607
JH
1148 backwards => $levelNumber, # or \@levelNumbers
1149 entry => $element,
91ae00cb 1150 hangul_terminator => $term_primary_weight,
45394607
JH
1151 ignoreName => qr/$ignoreName/,
1152 ignoreChar => qr/$ignoreChar/,
1153 katakana_before_hiragana => $bool,
1154 level => $collationLevel,
91ae00cb 1155 normalization => $normalization_form,
45394607
JH
1156 overrideCJK => \&overrideCJK,
1157 overrideHangul => \&overrideHangul,
1158 preprocess => \&preprocess,
1159 rearrange => \@charList,
1160 table => $filename,
1161 undefName => qr/$undefName/,
1162 undefChar => qr/$undefChar/,
1163 upper_before_lower => $bool,
91ae00cb 1164 variable => $variable,
45394607 1165 );
45394607
JH
1166
1167=over 4
1168
0116f5dc
JH
1169=item UCA_Version
1170
3756e7ca
RGS
1171If the tracking version number of UCA is given,
1172behavior of that tracking version is emulated on collating.
0116f5dc 1173If omitted, the return value of C<UCA_Version()> is used.
3756e7ca
RGS
1174C<UCA_Version()> should return the latest tracking version supported.
1175
1176The supported tracking version: 8, 9, 11, or 14.
1177
0d50d293
RGS
1178 UCA Unicode Standard DUCET (@version)
1179 ---------------------------------------------------
1180 8 3.1 3.0.1 (3.0.1d9)
1181 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1182 11 4.0 4.0.0 (4.0.0)
1183 14 4.1.0 4.1.0 (4.1.0)
3756e7ca
RGS
1184
1185Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1186
1187=item alternate
0116f5dc 1188
3756e7ca 1189-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1190
3756e7ca
RGS
1191For backward compatibility, C<alternate> (old name) can be used
1192as an alias for C<variable>.
0116f5dc 1193
45394607
JH
1194=item backwards
1195
4d36a948 1196-- see 3.1.2 French Accents, UTS #10.
45394607
JH
1197
1198 backwards => $levelNumber or \@levelNumbers
1199
1200Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1201If omitted, forwards at all the levels.
1202
1203=item entry
1204
4d36a948 1205-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1206
91ae00cb
NC
1207If the same character (or a sequence of characters) exists
1208in the collation element table through C<table>,
1209mapping to collation elements is overrided.
1210If it does not exist, the mapping is defined additionally.
45394607 1211
abd1ec54
NC
1212 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
12130063 0068 ; [.0E6A.0020.0002.0063] # ch
12140043 0068 ; [.0E6A.0020.0007.0043] # Ch
12150043 0048 ; [.0E6A.0020.0008.0043] # CH
1216006C 006C ; [.0F4C.0020.0002.006C] # ll
1217004C 006C ; [.0F4C.0020.0007.004C] # Ll
1218004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
121900F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1220006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
122100D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1222004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1223ENTRY
1224
1225 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
122600E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
122700C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1228ENTRY
45394607 1229
4d36a948 1230B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1231B<must> be a Unicode code point (defined as hexadecimal),
1232but not a native code point.
4d36a948
TS
1233So C<0063> must always denote C<U+0063>,
1234but not a character of C<"\x63">.
1235
abd1ec54
NC
1236Weighting may vary depending on collation element table.
1237So ensure the weights defined in C<entry> will be consistent with
1238those in the collation element table loaded via C<table>.
1239
1240In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1241and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1242(as a value between C<0E60> and C<0E6D>)
1243makes ordering as C<C E<lt> CH E<lt> D>.
1244Exactly speaking DUCET already has some characters between C<C> and C<D>:
1245C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1246C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1247and C<c-curl> (C<U+0255>) with C<0E69>.
1248Then primary weight C<0E6A> for C<CH> makes C<CH>
1249ordered between C<c-curl> and C<D>.
1250
91ae00cb
NC
1251=item hangul_terminator
1252
3756e7ca 1253-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1254
1255If a true value is given (non-zero but should be positive),
1256it will be added as a terminator primary weight to the end of
1257every standard Hangul syllable. Secondary and any higher weights
1258for terminator are set to zero.
1259If the value is false or C<hangul_terminator> key does not exist,
1260insertion of terminator weights will not be performed.
1261
1262Boundaries of Hangul syllables are determined
1263according to conjoining Jamo behavior in F<the Unicode Standard>
1264and F<HangulSyllableType.txt>.
1265
1266B<Implementation Note:>
1267(1) For expansion mapping (Unicode character mapped
1268to a sequence of collation elements), a terminator will not be added
1269between collation elements, even if Hangul syllable boundary exists there.
1270Addition of terminator is restricted to the next position
1271to the last collation element.
1272
1273(2) Non-conjoining Hangul letters
1274(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1275automatically terminated with a terminator primary weight.
1276These characters may need terminator included in a collation element
1277table beforehand.
1278
45394607
JH
1279=item ignoreChar
1280
3756e7ca
RGS
1281=item ignoreName
1282
1283-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1284
caffd4cf
TS
1285Makes the entry in the table completely ignorable;
1286i.e. as if the weights were zero at all level.
45394607 1287
3756e7ca
RGS
1288Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289will be ignored. Through C<ignoreName>, any character whose name
1290(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291will be ignored.
1292
a7fbee98 1293E.g. when 'a' and 'e' are ignorable,
45394607
JH
1294'element' is equal to 'lament' (or 'lmnt').
1295
3756e7ca
RGS
1296=item katakana_before_hiragana
1297
1298-- see 7.3.1 Tertiary Weight Table, UTS #10.
1299
1300By default, hiragana is before katakana.
1301If the parameter is made true, this is reversed.
1302
1303B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1304distinctions must occur in level 3, and their weights at level 3 must be
1305same as those mentioned in 7.3.1, UTS #10.
1306If you define your collation elements which violate this requirement,
1307this parameter does not work validly.
1308
45394607
JH
1309=item level
1310
3756e7ca 1311-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1312
1313Set the maximum level.
1314Any higher levels than the specified one are ignored.
1315
1316 Level 1: alphabetic ordering
1317 Level 2: diacritic ordering
1318 Level 3: case ordering
91ae00cb 1319 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1320
1321 ex.level => 2,
1322
a7fbee98
JH
1323If omitted, the maximum is the 4th.
1324
45394607
JH
1325=item normalization
1326
3756e7ca 1327-- see 4.1 Normalize, UTS #10.
45394607 1328
905aa9f0 1329If specified, strings are normalized before preparation of sort keys
45394607
JH
1330(the normalization is executed after preprocess).
1331
1d2654e1
JH
1332A form name C<Unicode::Normalize::normalize()> accepts will be applied
1333as C<$normalization_form>.
06c8fc8f 1334Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1335See C<Unicode::Normalize::normalize()> for detail.
1336If omitted, C<'NFD'> is used.
45394607 1337
91ae00cb 1338C<normalization> is performed after C<preprocess> (if defined).
45394607 1339
06c8fc8f
RGS
1340Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1341though they are not concerned with C<Unicode::Normalize::normalize()>.
1342
1343If C<undef> (not a string C<"undef">) is passed explicitly
1344as the value for this key,
45394607 1345any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1346if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1347only contiguous contractions are resolved;
1348e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1349C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1350In this point,
1351C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1352B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1353
1354In the case of C<(normalization =E<gt> "prenormalized")>,
1355any normalization is not performed, but
1356non-contiguous contractions with combining characters are performed.
1357Therefore
1358C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1359B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1360If source strings are finely prenormalized,
1361C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1362
1363Except C<(normalization =E<gt> undef)>,
1364B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1365
1366=item overrideCJK
1367
4d36a948 1368-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1369
91ae00cb 1370By default, CJK Unified Ideographs are ordered in Unicode codepoint order
3756e7ca
RGS
1371but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1372C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1373are lesser than C<CJK Unified Ideographs Extension> (its range is
1374C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
91ae00cb
NC
1375
1376Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1377
a7fbee98 1378ex. CJK Unified Ideographs in the JIS code point order.
45394607
JH
1379
1380 overrideCJK => sub {
a7fbee98
JH
1381 my $u = shift; # get a Unicode codepoint
1382 my $b = pack('n', $u); # to UTF-16BE
1383 my $s = your_unicode_to_sjis_converter($b); # convert
1384 my $n = unpack('n', $s); # convert sjis to short
1385 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1386 },
1387
a7fbee98
JH
1388ex. ignores all CJK Unified Ideographs.
1389
1390 overrideCJK => sub {()}, # CODEREF returning empty list
1391
1392 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1393 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1394
1395If C<undef> is passed explicitly as the value for this key,
1396weights for CJK Unified Ideographs are treated as undefined.
1397But assignment of weight for CJK Unified Ideographs
91ae00cb 1398in table or C<entry> is still valid.
a7fbee98
JH
1399
1400=item overrideHangul
1401
4d36a948 1402-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1403
abd1ec54
NC
1404By default, Hangul Syllables are decomposed into Hangul Jamo,
1405even if C<(normalization =E<gt> undef)>.
a7fbee98
JH
1406But the mapping of Hangul Syllables may be overrided.
1407
3756e7ca 1408This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1409
45394607 1410If you want to override the mapping of Hangul Syllables,
abd1ec54
NC
1411NFD, NFKD, and FCD are not appropriate,
1412since they will decompose Hangul Syllables before overriding.
45394607 1413
a7fbee98
JH
1414If C<undef> is passed explicitly as the value for this key,
1415weight for Hangul Syllables is treated as undefined
1416without decomposition into Hangul Jamo.
1417But definition of weight for Hangul Syllables
91ae00cb 1418in table or C<entry> is still valid.
a7fbee98 1419
45394607
JH
1420=item preprocess
1421
4d36a948 1422-- see 5.1 Preprocessing, UTS #10.
45394607
JH
1423
1424If specified, the coderef is used to preprocess
1425before the formation of sort keys.
1426
a7fbee98 1427ex. dropping English articles, such as "a" or "the".
45394607
JH
1428Then, "the pen" is before "a pencil".
1429
1430 preprocess => sub {
1431 my $str = shift;
a7fbee98 1432 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1433 return $str;
45394607
JH
1434 },
1435
91ae00cb 1436C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1437
45394607
JH
1438=item rearrange
1439
4d36a948 1440-- see 3.1.3 Rearrangement, UTS #10.
45394607
JH
1441
1442Characters that are not coded in logical order and to be rearranged.
3756e7ca 1443If C<UCA_Version> is equal to or lesser than 11, default is:
45394607
JH
1444
1445 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1446
3756e7ca
RGS
1447If you want to disallow any rearrangement, pass C<undef> or C<[]>
1448(a reference to empty list) as the value for this key.
1449
1450If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
a7fbee98 1451
0116f5dc
JH
1452B<According to the version 9 of UCA, this parameter shall not be used;
1453but it is not warned at present.>
1454
45394607
JH
1455=item table
1456
4d36a948 1457-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1458
91ae00cb 1459You can use another collation element table if desired.
45394607 1460
e7f779c8 1461The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1462on C<@INC>. Say, if the filename is F<Foo.txt>,
1463the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1464
1465By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1466If you will prepare your own table file, any name other than F<allkeys.txt>
1467may be better to avoid namespace conflict.
45394607 1468
a7fbee98 1469If C<undef> is passed explicitly as the value for this key,
91ae00cb 1470no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1471
1472A typical way to define a collation element table
1473without any file of table:
1474
1475 $onlyABC = Unicode::Collate->new(
1476 table => undef,
1477 entry => << 'ENTRIES',
14780061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
14790041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
14800062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
14810042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
14820063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
14830043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1484ENTRIES
1485 );
905aa9f0 1486
3756e7ca
RGS
1487If C<ignoreName> or C<undefName> is used, character names should be
1488specified as a comment (following C<#>) on each line.
45394607
JH
1489
1490=item undefChar
1491
3756e7ca
RGS
1492=item undefName
1493
4d36a948 1494-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607
JH
1495
1496Undefines the collation element as if it were unassigned in the table.
1497This reduces the size of the table.
1498If an unassigned character appears in the string to be collated,
1499the sort key is made from its codepoint
1500as a single-character collation element,
1501as it is greater than any other assigned collation elements
1502(in the codepoint order among the unassigned characters).
1503But, it'd be better to ignore characters
1504unfamiliar to you and maybe never used.
1505
3756e7ca
RGS
1506Through C<undefChar>, any character matching C<qr/$undefChar/>
1507will be undefined. Through C<undefName>, any character whose name
1508(given in the C<table> file as a comment) matches C<qr/$undefName/>
1509will be undefined.
1510
e7f779c8
RGS
1511ex. Collation weights for beyond-BMP characters are not stored in object:
1512
1513 undefChar => qr/[^\0-\x{fffd}]/,
1514
45394607
JH
1515=item upper_before_lower
1516
3756e7ca 1517-- see 6.6 Case Comparisons, UTS #10.
45394607 1518
3756e7ca
RGS
1519By default, lowercase is before uppercase.
1520If the parameter is made true, this is reversed.
45394607 1521
3756e7ca
RGS
1522B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1523distinctions must occur in level 3, and their weights at level 3 must be
1524same as those mentioned in 7.3.1, UTS #10.
1525If you define your collation elements which differs from this requirement,
1526this parameter doesn't work validly.
45394607 1527
91ae00cb
NC
1528=item variable
1529
91ae00cb
NC
1530-- see 3.2.2 Variable Weighting, UTS #10.
1531
91ae00cb
NC
1532This key allows to variable weighting for variable collation elements,
1533which are marked with an ASTERISK in the table
1534(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1535
1536 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1537
1538These names are case-insensitive.
1539By default (if specification is omitted), 'shifted' is adopted.
1540
1541 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1542 considered at the 4th level.
1543
abd1ec54 1544 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1545
1546 'Shifted' Variable elements are made ignorable at levels 1 through 3
1547 their level 4 weight is replaced by the old level 1 weight.
1548 Level 4 weight for Non-Variable elements is 0xFFFF.
1549
1550 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1551 are trimmed.
1552
45394607
JH
1553=back
1554
3164dd77 1555=head2 Methods for Collation
45394607
JH
1556
1557=over 4
1558
5398038e 1559=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1560
1561Sorts a list of strings.
1562
5398038e 1563=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1564
1565Returns 1 (when C<$a> is greater than C<$b>)
1566or 0 (when C<$a> is equal to C<$b>)
1567or -1 (when C<$a> is lesser than C<$b>).
1568
5398038e
TS
1569=item C<$result = $Collator-E<gt>eq($a, $b)>
1570
1571=item C<$result = $Collator-E<gt>ne($a, $b)>
1572
1573=item C<$result = $Collator-E<gt>lt($a, $b)>
1574
1575=item C<$result = $Collator-E<gt>le($a, $b)>
1576
1577=item C<$result = $Collator-E<gt>gt($a, $b)>
1578
1579=item C<$result = $Collator-E<gt>ge($a, $b)>
1580
a7fbee98 1581They works like the same name operators as theirs.
5398038e
TS
1582
1583 eq : whether $a is equal to $b.
1584 ne : whether $a is not equal to $b.
1585 lt : whether $a is lesser than $b.
1586 le : whether $a is lesser than $b or equal to $b.
1587 gt : whether $a is greater than $b.
1588 ge : whether $a is greater than $b or equal to $b.
1589
1590=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1591
3756e7ca 1592-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1593
1594Returns a sort key.
1595
1596You compare the sort keys using a binary comparison
1597and get the result of the comparison of the strings using UCA.
1598
5398038e 1599 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1600
1601 is equivalent to
1602
5398038e 1603 $Collator->cmp($a, $b)
45394607 1604
a7fbee98
JH
1605=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1606
3756e7ca
RGS
1607Converts a sorting key into its representation form.
1608If C<UCA_Version> is 8, the output is slightly different.
1609
a7fbee98
JH
1610 use Unicode::Collate;
1611 my $c = Unicode::Collate->new();
1612 print $c->viewSortKey("Perl"),"\n";
1613
0116f5dc
JH
1614 # output:
1615 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1616 # Level 1 Level 2 Level 3 Level 4
1617
4d36a948
TS
1618=back
1619
1620=head2 Methods for Searching
d16e9e3d 1621
3756e7ca 1622B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
4d36a948
TS
1623for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1624C<subst>, C<gsubst>) is croaked,
1625as the position and the length might differ
1626from those on the specified string.
3756e7ca 1627(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
d16e9e3d 1628
4d36a948
TS
1629The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1630like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1631but they are not aware of any pattern, but only a literal substring.
1632
1633=over 4
1634
1635=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1636
1637=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1638
1639If C<$substring> matches a part of C<$string>, returns
1640the position of the first occurrence of the matching part in scalar context;
1641in list context, returns a two-element list of
1642the position and the length of the matching part.
1643
d16e9e3d
JH
1644If C<$substring> does not match any part of C<$string>,
1645returns C<-1> in scalar context and
1646an empty list in list context.
1647
1648e.g. you say
1649
5398038e 1650 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948
TS
1651 # (normalization => undef) is REQUIRED.
1652 my $str = "Ich muß studieren Perl.";
1653 my $sub = "MÃœSS";
d16e9e3d 1654 my $match;
a7fbee98 1655 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1656 $match = substr($str, $pos, $len);
d16e9e3d
JH
1657 }
1658
4d36a948 1659and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1660is primary equal to C<"MÃœSS">.
4d36a948
TS
1661
1662=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1663
1664=item C<($match) = $Collator-E<gt>match($string, $substring)>
1665
1666If C<$substring> matches a part of C<$string>, in scalar context, returns
1667B<a reference to> the first occurrence of the matching part
1668(C<$match_ref> is always true if matches,
1669since every reference is B<true>);
1670in list context, returns the first occurrence of the matching part.
1671
1672If C<$substring> does not match any part of C<$string>,
1673returns C<undef> in scalar context and
1674an empty list in list context.
1675
1676e.g.
1677
1678 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1679 print "matches [$$match_ref].\n";
1680 } else {
1681 print "doesn't match.\n";
1682 }
1683
3756e7ca 1684 or
4d36a948
TS
1685
1686 if (($match) = $Collator->match($str, $sub)) { # list context
1687 print "matches [$match].\n";
1688 } else {
1689 print "doesn't match.\n";
1690 }
1691
1692=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1693
1694If C<$substring> matches a part of C<$string>, returns
1695all the matching parts (or matching count in scalar context).
1696
1697If C<$substring> does not match any part of C<$string>,
1698returns an empty list.
1699
1700=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1701
1702If C<$substring> matches a part of C<$string>,
1703the first occurrence of the matching part is replaced by C<$replacement>
1704(C<$string> is modified) and return C<$count> (always equals to C<1>).
1705
1706C<$replacement> can be a C<CODEREF>,
1707taking the matching part as an argument,
1708and returning a string to replace the matching part
1709(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1710
1711=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1712
1713If C<$substring> matches a part of C<$string>,
1714all the occurrences of the matching part is replaced by C<$replacement>
1715(C<$string> is modified) and return C<$count>.
1716
1717C<$replacement> can be a C<CODEREF>,
1718taking the matching part as an argument,
1719and returning a string to replace the matching part
1720(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1721
1722e.g.
1723
1724 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1725 # (normalization => undef) is REQUIRED.
3756e7ca 1726 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
4d36a948
TS
1727 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1728
3756e7ca 1729 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
4d36a948 1730 # i.e., all the camels are made bold-faced.
d16e9e3d 1731
45394607
JH
1732=back
1733
3164dd77
TS
1734=head2 Other Methods
1735
1736=over 4
1737
0116f5dc
JH
1738=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1739
1740Change the value of specified keys and returns the changed part.
1741
1742 $Collator = Unicode::Collate->new(level => 4);
1743
1744 $Collator->eq("perl", "PERL"); # false
1745
1746 %old = $Collator->change(level => 2); # returns (level => 4).
1747
1748 $Collator->eq("perl", "PERL"); # true
1749
1750 $Collator->change(%old); # returns (level => 2).
1751
1752 $Collator->eq("perl", "PERL"); # false
1753
1754Not all C<(key,value)>s are allowed to be changed.
1755See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1756
1757In the scalar context, returns the modified collator
1758(but it is B<not> a clone from the original).
1759
1760 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1761
1762 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1763
1764 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1765
91ae00cb 1766=item C<$version = $Collator-E<gt>version()>
3164dd77 1767
91ae00cb
NC
1768Returns the version number (a string) of the Unicode Standard
1769which the C<table> file used by the collator object is based on.
1770If the table does not include a version line (starting with C<@version>),
1771returns C<"unknown">.
1772
1773=item C<UCA_Version()>
3164dd77 1774
91ae00cb 1775Returns the tracking version number of UTS #10 this module consults.
3164dd77 1776
91ae00cb
NC
1777=item C<Base_Unicode_Version()>
1778
1779Returns the version number of UTS #10 this module consults.
3164dd77
TS
1780
1781=back
1782
3756e7ca
RGS
1783=head1 EXPORT
1784
1785No method will be exported.
45394607 1786
0d50d293
RGS
1787=head1 INSTALL
1788
1789Though this module can be used without any C<table> file,
1790to use this module easily, it is recommended to install a table file
1791in the UCA format, by copying it under the directory
1792<a place in @INC>/Unicode/Collate.
1793
6d24ed10
SP
1794The most preferable one is "The Default Unicode Collation Element Table"
1795(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
1796
1797 http://www.unicode.org/Public/UCA/
1798
1799 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1800
1801If DUCET is not installed, it is recommended to copy the file
1802from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1803to <a place in @INC>/Unicode/Collate/allkeys.txt
1804manually.
1805
3756e7ca
RGS
1806=head1 CAVEATS
1807
1808=over 4
45394607 1809
3756e7ca 1810=item Normalization
45394607 1811
3756e7ca
RGS
1812Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1813module (see L<Unicode::Normalize>).
45394607 1814
5398038e 1815If you need not it (say, in the case when you need not
45394607
JH
1816handle any combining characters),
1817assign C<normalization =E<gt> undef> explicitly.
1818
4d36a948 1819-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1820
3756e7ca 1821=item Conformance Test
0116f5dc 1822
10d7ec48
NC
1823The Conformance Test for the UCA is available
1824under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
1825
1826For F<CollationTest_SHIFTED.txt>,
1827a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1828for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1829C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1830
4d36a948 1831B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1832
3756e7ca
RGS
1833=back
1834
0d50d293 1835=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 1836
0d50d293
RGS
1837The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1838<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1839SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1840
0d50d293
RGS
1841This module is free software; you can redistribute it and/or
1842modify it under the same terms as Perl itself.
45394607 1843
0d50d293 1844The file Unicode/Collate/allkeys.txt was copied directly
6d24ed10 1845from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>.
0d50d293 1846This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
6d24ed10 1847Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
45394607
JH
1848
1849=head1 SEE ALSO
1850
1851=over 4
1852
91ae00cb
NC
1853=item Unicode Collation Algorithm - UTS #10
1854
1855L<http://www.unicode.org/reports/tr10/>
1856
1857=item The Default Unicode Collation Element Table (DUCET)
1858
10d7ec48 1859L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1860
91ae00cb 1861=item The conformance test for the UCA
45394607 1862
10d7ec48 1863L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1864
10d7ec48 1865L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1866
91ae00cb 1867=item Hangul Syllable Type
0116f5dc 1868
10d7ec48 1869L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1870
91ae00cb 1871=item Unicode Normalization Forms - UAX #15
a7fbee98 1872
91ae00cb 1873L<http://www.unicode.org/reports/tr15/>
a7fbee98 1874
45394607
JH
1875=back
1876
1877=cut