This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
In bodies_by_type in sv.c, no need to store for values for HEs.
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c
JH
6 }
7}
8
45394607
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48
NC
15no warnings 'utf8';
16
456a1446 17our $VERSION = '0.56';
45394607
JH
18our $PACKAGE = __PACKAGE__;
19
e7f779c8
RGS
20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
45394607 22
4d36a948
TS
23# Perl's boolean
24use constant TRUE => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 31my $CVgetCombinClass;
4d36a948 32
9f1f04a1
RGS
33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
4d36a948 37# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
4d36a948
TS
40
41# Shifted weight at 4th level
9f1f04a1 42use constant Shift4Wt => 0xFFFF;
4d36a948 43
4d36a948
TS
44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
4d36a948
TS
51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
0116f5dc
JH
70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR => 1; # Variable character
3164dd77 72
91ae00cb 73# specific code points
00e00351
CBW
74use constant Hangul_SBase => 0xAC00;
75use constant Hangul_SIni => 0xAC00;
76use constant Hangul_SFin => 0xD7A3;
77use constant Hangul_NCount => 588;
78use constant Hangul_TCount => 28;
91ae00cb
NC
79use constant Hangul_LBase => 0x1100;
80use constant Hangul_LIni => 0x1100;
81use constant Hangul_LFin => 0x1159;
82use constant Hangul_LFill => 0x115F;
00e00351 83use constant Hangul_LEnd => 0x115F; # Unicode 5.2.0
91ae00cb 84use constant Hangul_VBase => 0x1161;
3756e7ca 85use constant Hangul_VIni => 0x1160; # from Vowel Filler
91ae00cb 86use constant Hangul_VFin => 0x11A2;
00e00351 87use constant Hangul_VEnd => 0x11A7; # Unicode 5.2.0
3756e7ca 88use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
91ae00cb
NC
89use constant Hangul_TIni => 0x11A8;
90use constant Hangul_TFin => 0x11F9;
00e00351
CBW
91use constant Hangul_TEnd => 0x11FF; # Unicode 5.2.0
92use constant HangulL2Ini => 0xA960; # Unicode 5.2.0
93use constant HangulL2Fin => 0xA97C; # Unicode 5.2.0
94use constant HangulV2Ini => 0xD7B0; # Unicode 5.2.0
95use constant HangulV2Fin => 0xD7C6; # Unicode 5.2.0
96use constant HangulT2Ini => 0xD7CB; # Unicode 5.2.0
97use constant HangulT2Fin => 0xD7FB; # Unicode 5.2.0
98
91ae00cb
NC
99use constant CJK_UidIni => 0x4E00;
100use constant CJK_UidFin => 0x9FA5;
3756e7ca 101use constant CJK_UidF41 => 0x9FBB;
74b94a79 102use constant CJK_UidF51 => 0x9FC3;
00e00351 103use constant CJK_UidF52 => 0x9FCB;
91ae00cb
NC
104use constant CJK_ExtAIni => 0x3400;
105use constant CJK_ExtAFin => 0x4DB5;
106use constant CJK_ExtBIni => 0x20000;
107use constant CJK_ExtBFin => 0x2A6D6;
00e00351
CBW
108use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2.0
109use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2.0
91ae00cb 110
4d36a948 111# Logical_Order_Exception in PropList.txt
e7f779c8 112my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 113
00e00351 114sub UCA_Version { "20" }
a7fbee98 115
00e00351 116sub Base_Unicode_Version { "5.2.0" }
a7fbee98 117
9f1f04a1
RGS
118######
119
9f1f04a1 120sub pack_U {
ae6aa562 121 return pack('U*', @_);
9f1f04a1
RGS
122}
123
124sub unpack_U {
3756e7ca 125 return unpack('U*', shift(@_).pack('U*'));
9f1f04a1
RGS
126}
127
128######
129
91ae00cb
NC
130my (%VariableOK);
131@VariableOK{ qw/
0116f5dc 132 blanked non-ignorable shifted shift-trimmed
91ae00cb 133 / } = (); # keys lowercased
0116f5dc
JH
134
135our @ChangeOK = qw/
136 alternate backwards level normalization rearrange
137 katakana_before_hiragana upper_before_lower
138 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 139 hangul_terminator variable
0116f5dc
JH
140 /;
141
142our @ChangeNG = qw/
91ae00cb
NC
143 entry mapping table maxlength
144 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 145 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
00e00351 146 derivCode normCode rearrangeHash backwardsFlag
0116f5dc 147 /;
9f1f04a1
RGS
148# The hash key 'ignored' is deleted at v 0.21.
149# The hash key 'isShift' is deleted at v 0.23.
150# The hash key 'combining' is deleted at v 0.24.
91ae00cb 151# The hash key 'entries' is deleted at v 0.30.
e7f779c8 152# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb
NC
153
154sub version {
155 my $self = shift;
156 return $self->{versionTable} || 'unknown';
157}
0116f5dc
JH
158
159my (%ChangeOK, %ChangeNG);
160@ChangeOK{ @ChangeOK } = ();
161@ChangeNG{ @ChangeNG } = ();
162
163sub change {
164 my $self = shift;
165 my %hash = @_;
166 my %old;
91ae00cb
NC
167 if (exists $hash{variable} && exists $hash{alternate}) {
168 delete $hash{alternate};
169 }
170 elsif (!exists $hash{variable} && exists $hash{alternate}) {
171 $hash{variable} = $hash{alternate};
172 }
0116f5dc
JH
173 foreach my $k (keys %hash) {
174 if (exists $ChangeOK{$k}) {
175 $old{$k} = $self->{$k};
176 $self->{$k} = $hash{$k};
177 }
178 elsif (exists $ChangeNG{$k}) {
179 croak "change of $k via change() is not allowed!";
180 }
181 # else => ignored
182 }
3756e7ca 183 $self->checkCollator();
0116f5dc
JH
184 return wantarray ? %old : $self;
185}
a7fbee98 186
9f1f04a1
RGS
187sub _checkLevel {
188 my $level = shift;
abd1ec54
NC
189 my $key = shift; # 'level' or 'backwards'
190 MinLevel <= $level or croak sprintf
191 "Illegal level %d (in value for key '%s') lower than %d.",
192 $level, $key, MinLevel;
193 $level <= MaxLevel or croak sprintf
194 "Unsupported level %d (in value for key '%s') higher than %d.",
195 $level, $key, MaxLevel;
9f1f04a1
RGS
196}
197
91ae00cb
NC
198my %DerivCode = (
199 8 => \&_derivCE_8,
200 9 => \&_derivCE_9,
201 11 => \&_derivCE_9, # 11 == 9
3756e7ca 202 14 => \&_derivCE_14,
74b94a79
CBW
203 16 => \&_derivCE_14, # 16 == 14
204 18 => \&_derivCE_18,
00e00351 205 20 => \&_derivCE_20,
91ae00cb
NC
206);
207
0116f5dc
JH
208sub checkCollator {
209 my $self = shift;
9f1f04a1 210 _checkLevel($self->{level}, "level");
a7fbee98 211
91ae00cb
NC
212 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
213 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 214
91ae00cb 215 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 216 $self->{alternateTable} || 'shifted';
91ae00cb
NC
217 $self->{variable} = $self->{alternate} = lc($self->{variable});
218 exists $VariableOK{ $self->{variable} }
3756e7ca 219 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 220
9f1f04a1
RGS
221 if (! defined $self->{backwards}) {
222 $self->{backwardsFlag} = 0;
223 }
224 elsif (! ref $self->{backwards}) {
225 _checkLevel($self->{backwards}, "backwards");
226 $self->{backwardsFlag} = 1 << $self->{backwards};
227 }
228 else {
229 my %level;
230 $self->{backwardsFlag} = 0;
231 for my $b (@{ $self->{backwards} }) {
232 _checkLevel($b, "backwards");
233 $level{$b} = 1;
234 }
235 for my $v (sort keys %level) {
236 $self->{backwardsFlag} += 1 << $v;
237 }
238 }
0116f5dc 239
91ae00cb
NC
240 defined $self->{rearrange} or $self->{rearrange} = [];
241 ref $self->{rearrange}
242 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
243
244 # keys of $self->{rearrangeHash} are $self->{rearrange}.
245 $self->{rearrangeHash} = undef;
246
247 if (@{ $self->{rearrange} }) {
248 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
249 }
250
251 $self->{normCode} = undef;
a7fbee98
JH
252
253 if (defined $self->{normalization}) {
254 eval { require Unicode::Normalize };
91ae00cb 255 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 256
91ae00cb 257 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 258
91ae00cb
NC
259 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
260 $self->{normCode} = \&Unicode::Normalize::NFD;
261 }
262 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
263 my $norm = $self->{normalization};
264 $self->{normCode} = sub {
1d2654e1
JH
265 Unicode::Normalize::normalize($norm, shift);
266 };
06c8fc8f
RGS
267 eval { $self->{normCode}->("") }; # try
268 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 269 }
a7fbee98 270 }
0116f5dc
JH
271 return;
272}
273
274sub new
275{
276 my $class = shift;
277 my $self = bless { @_ }, $class;
45394607 278
a7fbee98 279 # If undef is passed explicitly, no file is read.
0116f5dc 280 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 281 $self->read_table() if defined $self->{table};
905aa9f0 282
a7fbee98 283 if ($self->{entry}) {
e7f779c8
RGS
284 while ($self->{entry} =~ /([^\n]+)/g) {
285 $self->parseEntry($1);
286 }
a7fbee98 287 }
905aa9f0 288
9f1f04a1 289 $self->{level} ||= MaxLevel;
0116f5dc 290 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 291
abd1ec54 292 $self->{overrideHangul} = FALSE
0116f5dc 293 if ! exists $self->{overrideHangul};
abd1ec54 294 $self->{overrideCJK} = FALSE
0116f5dc 295 if ! exists $self->{overrideCJK};
06c8fc8f 296 $self->{normalization} = 'NFD'
0116f5dc 297 if ! exists $self->{normalization};
3756e7ca
RGS
298 $self->{rearrange} = $self->{rearrangeTable} ||
299 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 300 if ! exists $self->{rearrange};
0116f5dc
JH
301 $self->{backwards} = $self->{backwardsTable}
302 if ! exists $self->{backwards};
a7fbee98 303
3756e7ca 304 $self->checkCollator();
a7fbee98
JH
305
306 return $self;
307}
905aa9f0 308
00e00351
CBW
309sub parseAtmark {
310 my $self = shift;
311 my $line = shift; # after s/^\s*\@//
312
313 if ($line =~ /^version\s*(\S*)/) {
314 $self->{versionTable} ||= $1;
315 }
316 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
317 $self->{variableTable} ||= $1;
318 }
319 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
320 $self->{alternateTable} ||= $1;
321 }
322 elsif ($line =~ /^backwards\s+(\S*)/) {
323 push @{ $self->{backwardsTable} }, $1;
324 }
325 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
326 push @{ $self->{forwardsTable} }, $1;
327 }
328 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
329 push @{ $self->{rearrangeTable} }, _getHexArray($1);
330 }
331}
332
905aa9f0 333sub read_table {
a7fbee98 334 my $self = shift;
a7fbee98 335
e7f779c8
RGS
336 my($f, $fh);
337 foreach my $d (@INC) {
338 $f = File::Spec->catfile($d, @Path, $self->{table});
339 last if open($fh, $f);
340 $f = undef;
341 }
0d50d293
RGS
342 if (!defined $f) {
343 $f = File::Spec->catfile(@Path, $self->{table});
344 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
345 }
a7fbee98 346
6d24ed10
SP
347 while (my $line = <$fh>) {
348 next if $line =~ /^\s*#/;
abd1ec54 349
00e00351
CBW
350 if ($line =~ s/^\s*\@//) {
351 $self->parseAtmark($line);
352 } else {
353 $self->parseEntry($line);
abd1ec54 354 }
45394607 355 }
e7f779c8 356 close $fh;
45394607
JH
357}
358
905aa9f0 359
45394607
JH
360##
361## get $line, parse it, and write an entry in $self
362##
363sub parseEntry
364{
a7fbee98
JH
365 my $self = shift;
366 my $line = shift;
4d36a948 367 my($name, $entry, @uv, @key);
a7fbee98
JH
368
369 return if $line !~ /^\s*[0-9A-Fa-f]/;
370
371 # removes comment and gets name
372 $name = $1
373 if $line =~ s/[#%]\s*(.*)//;
374 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
375
376 # gets element
377 my($e, $k) = split /;/, $line;
378 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
379 if ! $k;
380
4d36a948
TS
381 @uv = _getHexArray($e);
382 return if !@uv;
383
384 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 385
4d36a948 386 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 387 my $ele = pack_U(@uv);
a7fbee98 388
4d36a948
TS
389 # regarded as if it were not entried in the table
390 return
391 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 392
4d36a948
TS
393 # replaced as completely ignorable
394 $k = '[.0000.0000.0000.0000]'
395 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 396 }
0116f5dc 397
4d36a948
TS
398 # replaced as completely ignorable
399 $k = '[.0000.0000.0000.0000]'
400 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
401
4c843366 402 my $is_L3_ignorable = TRUE;
4d36a948 403
caffd4cf
TS
404 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
405 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
TS
406 my @wt = _getHexArray($arr);
407 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 408 $is_L3_ignorable = FALSE
3756e7ca
RGS
409 if $wt[0] || $wt[1] || $wt[2];
410 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
411 # is completely ignorable.
4c843366
JH
412 # For expansion, an entry $is_L3_ignorable
413 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 414 }
caffd4cf 415
e7f779c8 416 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 417
91ae00cb
NC
418 if (@uv > 1) {
419 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
420 and $self->{maxlength}{$uv[0]} = @uv;
421 }
45394607
JH
422}
423
9f1f04a1 424
45394607 425##
abd1ec54 426## VCE = _varCE(variable term, VCE)
45394607 427##
abd1ec54 428sub _varCE
45394607 429{
abd1ec54
NC
430 my $vbl = shift;
431 my $vce = shift;
432 if ($vbl eq 'non-ignorable') {
433 return $vce;
434 }
435 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
436
437 if ($var) {
438 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
439 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
440 }
441 elsif ($vbl eq 'blanked') {
442 return $vce;
443 }
444 else {
445 return pack(VCE_TEMPLATE, $var, @wt[0..2],
446 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
447 }
45394607
JH
448}
449
45394607
JH
450sub viewSortKey
451{
a7fbee98 452 my $self = shift;
9f1f04a1
RGS
453 $self->visualizeSortKey($self->getSortKey(@_));
454}
0116f5dc 455
9f1f04a1
RGS
456sub visualizeSortKey
457{
458 my $self = shift;
459 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 460
9f1f04a1 461 if ($self->{UCA_Version} <= 8) {
0116f5dc
JH
462 $view =~ s/ ?0000 ?/|/g;
463 } else {
464 $view =~ s/\b0000\b/|/g;
465 }
a7fbee98 466 return "[$view]";
45394607
JH
467}
468
d16e9e3d 469
45394607 470##
91ae00cb
NC
471## arrayref of JCPS = splitEnt(string to be collated)
472## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 473##
91ae00cb 474sub splitEnt
45394607 475{
a7fbee98 476 my $self = shift;
4d36a948
TS
477 my $wLen = $_[1];
478
a7fbee98 479 my $code = $self->{preprocess};
0116f5dc 480 my $norm = $self->{normCode};
91ae00cb 481 my $map = $self->{mapping};
a7fbee98
JH
482 my $max = $self->{maxlength};
483 my $reH = $self->{rearrangeHash};
3756e7ca 484 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
a7fbee98 485
4d36a948 486 my ($str, @buf);
a7fbee98 487
4d36a948
TS
488 if ($wLen) {
489 $code and croak "Preprocess breaks character positions. "
490 . "Don't use with index(), match(), etc.";
491 $norm and croak "Normalization breaks character positions. "
492 . "Don't use with index(), match(), etc.";
493 $str = $_[0];
494 }
495 else {
496 $str = $_[0];
497 $str = &$code($str) if ref $code;
498 $str = &$norm($str) if ref $norm;
499 }
a7fbee98 500
4d36a948 501 # get array of Unicode code point of string.
9f1f04a1 502 my @src = unpack_U($str);
4d36a948
TS
503
504 # rearrangement:
505 # Character positions are not kept if rearranged,
506 # then neglected if $wLen is true.
507 if ($reH && ! $wLen) {
a7fbee98
JH
508 for (my $i = 0; $i < @src; $i++) {
509 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
510 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
511 $i++;
512 }
513 }
45394607 514 }
45394607 515
3756e7ca 516 # remove a code point marked as a completely ignorable.
abd1ec54
NC
517 for (my $i = 0; $i < @src; $i++) {
518 $src[$i] = undef
e7f779c8
RGS
519 if _isIllegal($src[$i]) || ($ver9 &&
520 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
0116f5dc
JH
521 }
522
a7fbee98 523 for (my $i = 0; $i < @src; $i++) {
91ae00cb 524 my $jcps = $src[$i];
3756e7ca
RGS
525
526 # skip removed code point
527 if (! defined $jcps) {
528 if ($wLen && @buf) {
529 $buf[-1][2] = $i + 1;
530 }
531 next;
532 }
533
abd1ec54 534 my $i_orig = $i;
4d36a948 535
3756e7ca
RGS
536 # find contraction
537 if ($max->{$jcps}) {
91ae00cb
NC
538 my $temp_jcps = $jcps;
539 my $jcpsLen = 1;
540 my $maxLen = $max->{$jcps};
4d36a948 541
91ae00cb 542 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 543 next if ! defined $src[$p];
91ae00cb
NC
544 $temp_jcps .= CODE_SEP . $src[$p];
545 $jcpsLen++;
546 if ($map->{$temp_jcps}) {
547 $jcps = $temp_jcps;
4d36a948
TS
548 $i = $p;
549 }
550 }
4d36a948 551
06c8fc8f
RGS
552 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
553 # This process requires Unicode::Normalize.
91ae00cb 554 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
555 # (in spite of bool value of $CVgetCombinClass),
556 # since canonical ordering cannot be expected.
557 # Blocked combining character should not be contracted.
558
559 if ($self->{normalization})
560 # $self->{normCode} is false in the case of "prenormalized".
561 {
562 my $preCC = 0;
563 my $curCC = 0;
564
565 for (my $p = $i + 1; $p < @src; $p++) {
566 next if ! defined $src[$p];
567 $curCC = $CVgetCombinClass->($src[$p]);
568 last unless $curCC;
569 my $tail = CODE_SEP . $src[$p];
91ae00cb
NC
570 if ($preCC != $curCC && $map->{$jcps.$tail}) {
571 $jcps .= $tail;
06c8fc8f
RGS
572 $src[$p] = undef;
573 } else {
574 $preCC = $curCC;
575 }
4d36a948 576 }
a7fbee98 577 }
a7fbee98
JH
578 }
579
3756e7ca
RGS
580 # skip completely ignorable
581 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
582 if ($wLen && @buf) {
583 $buf[-1][2] = $i + 1;
a7fbee98 584 }
3756e7ca 585 next;
a7fbee98 586 }
4d36a948 587
91ae00cb 588 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 589 }
4d36a948 590 return \@buf;
d16e9e3d 591}
45394607 592
d16e9e3d
JH
593
594##
abd1ec54 595## list of VCE = getWt(JCPS)
d16e9e3d
JH
596##
597sub getWt
598{
a7fbee98 599 my $self = shift;
91ae00cb 600 my $u = shift;
abd1ec54 601 my $vbl = $self->{variable};
91ae00cb 602 my $map = $self->{mapping};
0116f5dc 603 my $der = $self->{derivCode};
a7fbee98 604
91ae00cb 605 return if !defined $u;
abd1ec54 606 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 607 if $map->{$u};
a7fbee98 608
91ae00cb
NC
609 # JCPS must not be a contraction, then it's a code point.
610 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1
JH
611 my $hang = $self->{overrideHangul};
612 my @hangulCE;
613 if ($hang) {
614 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
615 }
616 elsif (!defined $hang) {
617 @hangulCE = $der->($u);
618 }
619 else {
620 my $max = $self->{maxlength};
621 my @decH = _decompHangul($u);
622
623 if (@decH == 2) {
624 my $contract = join(CODE_SEP, @decH);
91ae00cb 625 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
626 } else { # must be <@decH == 3>
627 if ($max->{$decH[0]}) {
628 my $contract = join(CODE_SEP, @decH);
91ae00cb 629 if ($map->{$contract}) {
1d2654e1
JH
630 @decH = ($contract);
631 } else {
632 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 633 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
634 }
635 # even if V's ignorable, LT contraction is not supported.
636 # If such a situatution were required, NFD should be used.
637 }
638 if (@decH == 3 && $max->{$decH[1]}) {
639 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 640 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
641 }
642 }
643
644 @hangulCE = map({
91ae00cb 645 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1
JH
646 } @decH);
647 }
abd1ec54 648 return map _varCE($vbl, $_), @hangulCE;
a7fbee98 649 }
3756e7ca 650 elsif (_isUIdeo($u, $self->{UCA_Version})) {
1d2654e1 651 my $cjk = $self->{overrideCJK};
abd1ec54 652 return map _varCE($vbl, $_),
0116f5dc 653 $cjk
4d36a948 654 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
3756e7ca
RGS
655 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
656 ? _uideoCE_8($u)
0116f5dc 657 : $der->($u);
a7fbee98
JH
658 }
659 else {
abd1ec54 660 return map _varCE($vbl, $_), $der->($u);
a7fbee98 661 }
d16e9e3d
JH
662}
663
d16e9e3d
JH
664
665##
666## string sortkey = getSortKey(string arg)
667##
668sub getSortKey
669{
a7fbee98
JH
670 my $self = shift;
671 my $lev = $self->{level};
91ae00cb 672 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
00e00351
CBW
673 my $vers = $self->{UCA_Version};
674 my $v2i = $vers >= 9 && $self->{variable} ne 'non-ignorable';
91ae00cb 675
abd1ec54 676 my @buf; # weight arrays
91ae00cb
NC
677 if ($self->{hangul_terminator}) {
678 my $preHST = '';
679 foreach my $jcps (@$rEnt) {
680 # weird things like VL, TL-contraction are not considered!
681 my $curHST = '';
682 foreach my $u (split /;/, $jcps) {
00e00351 683 $curHST .= getHST($u, $vers);
91ae00cb
NC
684 }
685 if ($preHST && !$curHST || # hangul before non-hangul
686 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
687 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
688 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
0116f5dc 689
abd1ec54 690 push @buf, $self->getWtHangulTerm();
91ae00cb
NC
691 }
692 $preHST = $curHST;
693
abd1ec54 694 push @buf, $self->getWt($jcps);
91ae00cb
NC
695 }
696 $preHST # end at hangul
abd1ec54 697 and push @buf, $self->getWtHangulTerm();
91ae00cb
NC
698 }
699 else {
700 foreach my $jcps (@$rEnt) {
abd1ec54 701 push @buf, $self->getWt($jcps);
91ae00cb
NC
702 }
703 }
704
abd1ec54
NC
705 # make sort key
706 my @ret = ([],[],[],[]);
707 my $last_is_variable;
708
709 foreach my $vwt (@buf) {
710 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
3756e7ca
RGS
711
712 # "Ignorable (L1, L2) after Variable" since track. v. 9
abd1ec54
NC
713 if ($v2i) {
714 if ($var) {
715 $last_is_variable = TRUE;
716 }
717 elsif (!$wt[0]) { # ignorable
4d36a948 718 next if $last_is_variable;
abd1ec54
NC
719 }
720 else {
721 $last_is_variable = FALSE;
0116f5dc
JH
722 }
723 }
abd1ec54
NC
724 foreach my $v (0..$lev-1) {
725 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98
JH
726 }
727 }
45394607 728
a7fbee98
JH
729 # modification of tertiary weights
730 if ($self->{upper_before_lower}) {
6d24ed10
SP
731 foreach my $w (@{ $ret[2] }) {
732 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
733 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
734 elsif ($w == 0x1C) { $w += 1 } # square upper
735 elsif ($w == 0x1D) { $w -= 1 } # square lower
a7fbee98 736 }
45394607 737 }
a7fbee98 738 if ($self->{katakana_before_hiragana}) {
6d24ed10
SP
739 foreach my $w (@{ $ret[2] }) {
740 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
741 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
a7fbee98 742 }
45394607 743 }
9f1f04a1
RGS
744
745 if ($self->{backwardsFlag}) {
746 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
747 if ($self->{backwardsFlag} & (1 << $v)) {
748 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
749 }
750 }
751 }
752
4d36a948 753 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607
JH
754}
755
756
757##
d16e9e3d 758## int compare = cmp(string a, string b)
45394607 759##
5398038e
TS
760sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
761sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
762sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
763sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
764sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
765sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
766sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
767
768##
d16e9e3d 769## list[strings] sorted = sort(list[strings] arg)
45394607 770##
a7fbee98
JH
771sub sort {
772 my $obj = shift;
773 return
774 map { $_->[1] }
775 sort{ $a->[0] cmp $b->[0] }
776 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
777}
778
0116f5dc 779
00e00351
CBW
780sub _derivCE_20 {
781 my $u = shift;
782 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52) ? 0xFB40 : # CJK
783 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
784 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
785 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ? 0xFB80 # CJK ext.
786 : 0xFBC0; # others
787 my $aaaa = $base + ($u >> 15);
788 my $bbbb = ($u & 0x7FFF) | 0x8000;
789 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
790 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
791}
792
74b94a79 793sub _derivCE_18 {
3756e7ca 794 my $u = shift;
74b94a79
CBW
795 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51) ? 0xFB40 : # CJK
796 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
797 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext.
798 : 0xFBC0; # others
799 my $aaaa = $base + ($u >> 15);
800 my $bbbb = ($u & 0x7FFF) | 0x8000;
801 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
802 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
803}
3756e7ca 804
74b94a79
CBW
805sub _derivCE_14 {
806 my $u = shift;
807 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41) ? 0xFB40 : # CJK
808 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
809 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext.
810 : 0xFBC0; # others
3756e7ca
RGS
811 my $aaaa = $base + ($u >> 15);
812 my $bbbb = ($u & 0x7FFF) | 0x8000;
74b94a79
CBW
813 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
814 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
3756e7ca
RGS
815}
816
4d36a948 817sub _derivCE_9 {
0116f5dc 818 my $u = shift;
74b94a79
CBW
819 my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin) ? 0xFB40 : # CJK
820 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
821 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext.
822 : 0xFBC0; # others
0116f5dc
JH
823 my $aaaa = $base + ($u >> 15);
824 my $bbbb = ($u & 0x7FFF) | 0x8000;
74b94a79
CBW
825 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
826 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc
JH
827}
828
4d36a948 829sub _derivCE_8 {
0116f5dc
JH
830 my $code = shift;
831 my $aaaa = 0xFF80 + ($code >> 15);
832 my $bbbb = ($code & 0x7FFF) | 0x8000;
833 return
4d36a948
TS
834 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
835 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607
JH
836}
837
3756e7ca
RGS
838sub _uideoCE_8 {
839 my $u = shift;
840 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
841}
842
843sub _isUIdeo {
844 my ($u, $uca_vers) = @_;
74b94a79 845 return((CJK_UidIni <= $u && (
00e00351 846 $uca_vers >= 20 ? ($u <= CJK_UidF52) :
74b94a79
CBW
847 $uca_vers >= 18 ? ($u <= CJK_UidF51) :
848 $uca_vers >= 14 ? ($u <= CJK_UidF41) :
849 ($u <= CJK_UidFin)))
3756e7ca
RGS
850 ||
851 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
852 ||
853 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
00e00351
CBW
854 ||
855 ($uca_vers >= 20 &&
856 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
3756e7ca
RGS
857 );
858}
859
91ae00cb 860
abd1ec54 861sub getWtHangulTerm {
91ae00cb 862 my $self = shift;
abd1ec54
NC
863 return _varCE($self->{variable},
864 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
91ae00cb
NC
865}
866
867
45394607
JH
868##
869## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
870##
a7fbee98 871sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 872
a7fbee98 873#
4d36a948 874# $code *must* be in Hangul syllable.
a7fbee98
JH
875# Check it before you enter here.
876#
5398038e
TS
877sub _decompHangul {
878 my $code = shift;
3756e7ca
RGS
879 my $si = $code - Hangul_SBase;
880 my $li = int( $si / Hangul_NCount);
881 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
882 my $ti = $si % Hangul_TCount;
5398038e 883 return (
3756e7ca
RGS
884 Hangul_LBase + $li,
885 Hangul_VBase + $vi,
886 $ti ? (Hangul_TBase + $ti) : (),
5398038e 887 );
45394607
JH
888}
889
10d7ec48 890sub _isIllegal {
4d36a948
TS
891 my $code = shift;
892 return ! defined $code # removed
893 || ($code < 0 || 0x10FFFF < $code) # out of range
894 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
895 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
896 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
897 ;
898}
899
91ae00cb
NC
900# Hangul Syllable Type
901sub getHST {
902 my $u = shift;
00e00351
CBW
903 my $vers = shift || 0;
904
905 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
906 return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
907 }
908
909 if ($vers < 20) {
910 return Hangul_LIni <= $u && $u <= Hangul_LFin ||
911 $u == Hangul_LFill ? "L" :
912 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
913 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : "";
914 } else {
915 return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
916 HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" :
917 Hangul_VIni <= $u && $u <= Hangul_VEnd ||
918 HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" :
919 Hangul_TIni <= $u && $u <= Hangul_TEnd ||
920 HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : "";
921 }
91ae00cb
NC
922}
923
4d36a948
TS
924
925##
926## bool _nonIgnorAtLevel(arrayref weights, int level)
927##
928sub _nonIgnorAtLevel($$)
929{
930 my $wt = shift;
931 return if ! defined $wt;
932 my $lv = shift;
9f1f04a1 933 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
TS
934}
935
936##
937## bool _eqArray(
938## arrayref of arrayref[weights] source,
939## arrayref of arrayref[weights] substr,
940## int level)
941## * comparison of graphemes vs graphemes.
942## @$source >= @$substr must be true (check it before call this);
943##
944sub _eqArray($$$)
945{
946 my $source = shift;
947 my $substr = shift;
948 my $lev = shift;
949
950 for my $g (0..@$substr-1){
951 # Do the $g'th graphemes have the same number of AV weigths?
952 return if @{ $source->[$g] } != @{ $substr->[$g] };
953
954 for my $w (0..@{ $substr->[$g] }-1) {
955 for my $v (0..$lev-1) {
956 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
957 }
958 }
959 }
960 return 1;
961}
962
963##
964## (int position, int length)
965## int position = index(string, substring, position, [undoc'ed grobal])
966##
967## With "grobal" (only for the list context),
968## returns list of arrayref[position, length].
969##
970sub index
971{
91ae00cb
NC
972 my $self = shift;
973 my $str = shift;
974 my $len = length($str);
975 my $subE = $self->splitEnt(shift);
976 my $pos = @_ ? shift : 0;
977 $pos = 0 if $pos < 0;
978 my $grob = shift;
979
980 my $lev = $self->{level};
3756e7ca
RGS
981 my $v2i = $self->{UCA_Version} >= 9 &&
982 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
983
984 if (! @$subE) {
4d36a948
TS
985 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
986 return $grob
987 ? map([$_, 0], $temp..$len)
988 : wantarray ? ($temp,0) : $temp;
989 }
abd1ec54
NC
990 $len < $pos
991 and return wantarray ? () : NOMATCHPOS;
91ae00cb 992 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
993 @$strE
994 or return wantarray ? () : NOMATCHPOS;
995
4d36a948
TS
996 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
997
abd1ec54
NC
998 my $last_is_variable;
999 for my $vwt (map $self->getWt($_), @$subE) {
1000 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1001 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 1002
3756e7ca
RGS
1003 # "Ignorable (L1, L2) after Variable" since track. v. 9
1004 if ($v2i) {
abd1ec54
NC
1005 if ($var) {
1006 $last_is_variable = TRUE;
1007 }
1008 elsif (!$wt[0]) { # ignorable
4d36a948 1009 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
1010 }
1011 else {
1012 $last_is_variable = FALSE;
4d36a948
TS
1013 }
1014 }
1015
abd1ec54
NC
1016 if (@subWt && !$var && !$wt[0]) {
1017 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
4d36a948 1018 } else {
abd1ec54 1019 push @subWt, [ \@wt ];
4d36a948
TS
1020 }
1021 }
1022
1023 my $count = 0;
91ae00cb 1024 my $end = @$strE - 1;
4d36a948 1025
abd1ec54 1026 $last_is_variable = FALSE; # reuse
4d36a948
TS
1027 for (my $i = 0; $i <= $end; ) { # no $i++
1028 my $found_base = 0;
1029
1030 # fetch a grapheme
1031 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
1032 for my $vwt ($self->getWt($strE->[$i][0])) {
1033 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1034 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 1035
3756e7ca
RGS
1036 # "Ignorable (L1, L2) after Variable" since track. v. 9
1037 if ($v2i) {
abd1ec54
NC
1038 if ($var) {
1039 $last_is_variable = TRUE;
1040 }
1041 elsif (!$wt[0]) { # ignorable
4d36a948 1042 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
1043 }
1044 else {
1045 $last_is_variable = FALSE;
4d36a948
TS
1046 }
1047 }
1048
abd1ec54
NC
1049 if (@strWt && !$var && !$wt[0]) {
1050 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 1051 $finPos[-1] = $strE->[$i][2];
4d36a948 1052 } elsif ($to_be_pushed) {
abd1ec54 1053 push @strWt, [ \@wt ];
91ae00cb 1054 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 1055 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 1056 push @finPos, $strE->[$i][2];
4d36a948
TS
1057 $found_base++;
1058 }
1059 # else ===> no-op
1060 }
1061 $i++;
1062 }
1063
1064 # try to match
1065 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1066 if ($iniPos[0] != NOMATCHPOS &&
1067 $finPos[$#subWt] != NOMATCHPOS &&
1068 _eqArray(\@strWt, \@subWt, $lev)) {
1069 my $temp = $iniPos[0] + $pos;
1070
1071 if ($grob) {
1072 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1073 splice @strWt, 0, $#subWt;
1074 splice @iniPos, 0, $#subWt;
1075 splice @finPos, 0, $#subWt;
1076 }
1077 else {
1078 return wantarray
1079 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1080 : $temp;
1081 }
1082 }
1083 shift @strWt;
1084 shift @iniPos;
1085 shift @finPos;
1086 }
1087 }
1088
1089 return $grob
1090 ? @g_ret
1091 : wantarray ? () : NOMATCHPOS;
1092}
1093
1094##
1095## scalarref to matching part = match(string, substring)
1096##
1097sub match
1098{
1099 my $self = shift;
1100 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1101 my $temp = substr($_[0], $pos, $len);
1102 return wantarray ? $temp : \$temp;
1103 # An lvalue ref \substr should be avoided,
1104 # since its value is affected by modification of its referent.
1105 }
1106 else {
1107 return;
1108 }
1109}
1110
1111##
1112## arrayref matching parts = gmatch(string, substring)
1113##
1114sub gmatch
1115{
1116 my $self = shift;
1117 my $str = shift;
1118 my $sub = shift;
1119 return map substr($str, $_->[0], $_->[1]),
1120 $self->index($str, $sub, 0, 'g');
1121}
1122
1123##
1124## bool subst'ed = subst(string, substring, replace)
1125##
1126sub subst
1127{
1128 my $self = shift;
1129 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1130
1131 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1132 if ($code) {
1133 my $mat = substr($_[0], $pos, $len);
1134 substr($_[0], $pos, $len, $code->($mat));
1135 } else {
1136 substr($_[0], $pos, $len, $_[2]);
1137 }
1138 return TRUE;
1139 }
1140 else {
1141 return FALSE;
1142 }
1143}
1144
1145##
1146## int count = gsubst(string, substring, replace)
1147##
1148sub gsubst
1149{
1150 my $self = shift;
1151 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1152 my $cnt = 0;
1153
1154 # Replacement is carried out from the end, then use reverse.
1155 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1156 if ($code) {
1157 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1158 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1159 } else {
1160 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1161 }
1162 $cnt++;
1163 }
1164 return $cnt;
1165}
1166
45394607
JH
11671;
1168__END__
1169
1170=head1 NAME
1171
a7fbee98 1172Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1173
1174=head1 SYNOPSIS
1175
1176 use Unicode::Collate;
1177
1178 #construct
5398038e 1179 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1180
1181 #sort
5398038e 1182 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1183
1184 #compare
a7fbee98 1185 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1186
91ae00cb
NC
1187 # If %tailoring is false (i.e. empty),
1188 # $Collator should do the default collation.
1189
45394607
JH
1190=head1 DESCRIPTION
1191
3756e7ca
RGS
1192This module is an implementation of Unicode Technical Standard #10
1193(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1194
45394607
JH
1195=head2 Constructor and Tailoring
1196
d16e9e3d
JH
1197The C<new> method returns a collator object.
1198
5398038e 1199 $Collator = Unicode::Collate->new(
0116f5dc 1200 UCA_Version => $UCA_Version,
91ae00cb 1201 alternate => $alternate, # deprecated: use of 'variable' is recommended.
45394607
JH
1202 backwards => $levelNumber, # or \@levelNumbers
1203 entry => $element,
91ae00cb 1204 hangul_terminator => $term_primary_weight,
45394607
JH
1205 ignoreName => qr/$ignoreName/,
1206 ignoreChar => qr/$ignoreChar/,
1207 katakana_before_hiragana => $bool,
1208 level => $collationLevel,
91ae00cb 1209 normalization => $normalization_form,
45394607
JH
1210 overrideCJK => \&overrideCJK,
1211 overrideHangul => \&overrideHangul,
1212 preprocess => \&preprocess,
1213 rearrange => \@charList,
1214 table => $filename,
1215 undefName => qr/$undefName/,
1216 undefChar => qr/$undefChar/,
1217 upper_before_lower => $bool,
91ae00cb 1218 variable => $variable,
45394607 1219 );
45394607
JH
1220
1221=over 4
1222
0116f5dc
JH
1223=item UCA_Version
1224
3756e7ca
RGS
1225If the tracking version number of UCA is given,
1226behavior of that tracking version is emulated on collating.
0116f5dc 1227If omitted, the return value of C<UCA_Version()> is used.
3756e7ca
RGS
1228C<UCA_Version()> should return the latest tracking version supported.
1229
00e00351 1230The supported tracking version: 8, 9, 11, 14, 16, 18 or 20.
3756e7ca 1231
0d50d293
RGS
1232 UCA Unicode Standard DUCET (@version)
1233 ---------------------------------------------------
1234 8 3.1 3.0.1 (3.0.1d9)
1235 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1236 11 4.0 4.0.0 (4.0.0)
1237 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1238 16 5.0 5.0.0 (5.0.0)
1239 18 5.1.0 5.1.0 (5.1.0)
00e00351 1240 20 5.2.0 5.2.0 (5.2.0)
3756e7ca
RGS
1241
1242Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1243
1244=item alternate
0116f5dc 1245
3756e7ca 1246-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1247
3756e7ca
RGS
1248For backward compatibility, C<alternate> (old name) can be used
1249as an alias for C<variable>.
0116f5dc 1250
45394607
JH
1251=item backwards
1252
4d36a948 1253-- see 3.1.2 French Accents, UTS #10.
45394607
JH
1254
1255 backwards => $levelNumber or \@levelNumbers
1256
1257Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1258If omitted, forwards at all the levels.
1259
1260=item entry
1261
4d36a948 1262-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1263
91ae00cb
NC
1264If the same character (or a sequence of characters) exists
1265in the collation element table through C<table>,
1266mapping to collation elements is overrided.
1267If it does not exist, the mapping is defined additionally.
45394607 1268
abd1ec54
NC
1269 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
12700063 0068 ; [.0E6A.0020.0002.0063] # ch
12710043 0068 ; [.0E6A.0020.0007.0043] # Ch
12720043 0048 ; [.0E6A.0020.0008.0043] # CH
1273006C 006C ; [.0F4C.0020.0002.006C] # ll
1274004C 006C ; [.0F4C.0020.0007.004C] # Ll
1275004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
127600F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1277006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
127800D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1279004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1280ENTRY
1281
1282 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
128300E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
128400C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1285ENTRY
45394607 1286
4d36a948 1287B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1288B<must> be a Unicode code point (defined as hexadecimal),
1289but not a native code point.
4d36a948
TS
1290So C<0063> must always denote C<U+0063>,
1291but not a character of C<"\x63">.
1292
abd1ec54
NC
1293Weighting may vary depending on collation element table.
1294So ensure the weights defined in C<entry> will be consistent with
1295those in the collation element table loaded via C<table>.
1296
1297In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1298and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1299(as a value between C<0E60> and C<0E6D>)
1300makes ordering as C<C E<lt> CH E<lt> D>.
1301Exactly speaking DUCET already has some characters between C<C> and C<D>:
1302C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1303C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1304and C<c-curl> (C<U+0255>) with C<0E69>.
1305Then primary weight C<0E6A> for C<CH> makes C<CH>
1306ordered between C<c-curl> and C<D>.
1307
91ae00cb
NC
1308=item hangul_terminator
1309
3756e7ca 1310-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1311
1312If a true value is given (non-zero but should be positive),
1313it will be added as a terminator primary weight to the end of
1314every standard Hangul syllable. Secondary and any higher weights
1315for terminator are set to zero.
1316If the value is false or C<hangul_terminator> key does not exist,
1317insertion of terminator weights will not be performed.
1318
1319Boundaries of Hangul syllables are determined
1320according to conjoining Jamo behavior in F<the Unicode Standard>
1321and F<HangulSyllableType.txt>.
1322
1323B<Implementation Note:>
1324(1) For expansion mapping (Unicode character mapped
1325to a sequence of collation elements), a terminator will not be added
1326between collation elements, even if Hangul syllable boundary exists there.
1327Addition of terminator is restricted to the next position
1328to the last collation element.
1329
1330(2) Non-conjoining Hangul letters
1331(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1332automatically terminated with a terminator primary weight.
1333These characters may need terminator included in a collation element
1334table beforehand.
1335
45394607
JH
1336=item ignoreChar
1337
3756e7ca
RGS
1338=item ignoreName
1339
1340-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1341
caffd4cf
TS
1342Makes the entry in the table completely ignorable;
1343i.e. as if the weights were zero at all level.
45394607 1344
3756e7ca
RGS
1345Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1346will be ignored. Through C<ignoreName>, any character whose name
1347(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1348will be ignored.
1349
a7fbee98 1350E.g. when 'a' and 'e' are ignorable,
45394607
JH
1351'element' is equal to 'lament' (or 'lmnt').
1352
3756e7ca
RGS
1353=item katakana_before_hiragana
1354
1355-- see 7.3.1 Tertiary Weight Table, UTS #10.
1356
1357By default, hiragana is before katakana.
1358If the parameter is made true, this is reversed.
1359
1360B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1361distinctions must occur in level 3, and their weights at level 3 must be
1362same as those mentioned in 7.3.1, UTS #10.
1363If you define your collation elements which violate this requirement,
1364this parameter does not work validly.
1365
45394607
JH
1366=item level
1367
3756e7ca 1368-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1369
1370Set the maximum level.
1371Any higher levels than the specified one are ignored.
1372
1373 Level 1: alphabetic ordering
1374 Level 2: diacritic ordering
1375 Level 3: case ordering
91ae00cb 1376 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1377
1378 ex.level => 2,
1379
a7fbee98
JH
1380If omitted, the maximum is the 4th.
1381
45394607
JH
1382=item normalization
1383
3756e7ca 1384-- see 4.1 Normalize, UTS #10.
45394607 1385
905aa9f0 1386If specified, strings are normalized before preparation of sort keys
45394607
JH
1387(the normalization is executed after preprocess).
1388
1d2654e1
JH
1389A form name C<Unicode::Normalize::normalize()> accepts will be applied
1390as C<$normalization_form>.
06c8fc8f 1391Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1392See C<Unicode::Normalize::normalize()> for detail.
1393If omitted, C<'NFD'> is used.
45394607 1394
91ae00cb 1395C<normalization> is performed after C<preprocess> (if defined).
45394607 1396
06c8fc8f
RGS
1397Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1398though they are not concerned with C<Unicode::Normalize::normalize()>.
1399
1400If C<undef> (not a string C<"undef">) is passed explicitly
1401as the value for this key,
45394607 1402any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1403if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1404only contiguous contractions are resolved;
1405e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1406C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1407In this point,
1408C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1409B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1410
1411In the case of C<(normalization =E<gt> "prenormalized")>,
1412any normalization is not performed, but
1413non-contiguous contractions with combining characters are performed.
1414Therefore
1415C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1416B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1417If source strings are finely prenormalized,
1418C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1419
1420Except C<(normalization =E<gt> undef)>,
1421B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1422
1423=item overrideCJK
1424
4d36a948 1425-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1426
74b94a79 1427By default, CJK Unified Ideographs are ordered in Unicode codepoint
00e00351
CBW
1428order but C<CJK Unified Ideographs> are lesser than
1429C<CJK Unified Ideographs Extension>.
1430
1431 CJK Unified Ideographs:
1432 U+4E00..U+9FA5 if UCA_Version is 8 to 11;
1433 U+4E00..U+9FBB if UCA_Version is 14 to 16;
1434 U+4E00..U+9FC3 if UCA_Version is 18;
1435 U+4E00..U+9FCB if UCA_Version> is 20.
1436
1437 CJK Unified Ideographs Extension:
1438 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) if UCA_Version < 20;
1439 Ext.A, Ext.B and Ext.C (U+2A700..U+2B734) if UCA_Version is 20.
91ae00cb
NC
1440
1441Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
45394607 1442
a7fbee98 1443ex. CJK Unified Ideographs in the JIS code point order.
45394607
JH
1444
1445 overrideCJK => sub {
a7fbee98
JH
1446 my $u = shift; # get a Unicode codepoint
1447 my $b = pack('n', $u); # to UTF-16BE
1448 my $s = your_unicode_to_sjis_converter($b); # convert
1449 my $n = unpack('n', $s); # convert sjis to short
1450 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1451 },
1452
a7fbee98
JH
1453ex. ignores all CJK Unified Ideographs.
1454
1455 overrideCJK => sub {()}, # CODEREF returning empty list
1456
1457 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1458 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1459
1460If C<undef> is passed explicitly as the value for this key,
1461weights for CJK Unified Ideographs are treated as undefined.
1462But assignment of weight for CJK Unified Ideographs
91ae00cb 1463in table or C<entry> is still valid.
a7fbee98
JH
1464
1465=item overrideHangul
1466
4d36a948 1467-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1468
abd1ec54
NC
1469By default, Hangul Syllables are decomposed into Hangul Jamo,
1470even if C<(normalization =E<gt> undef)>.
a7fbee98
JH
1471But the mapping of Hangul Syllables may be overrided.
1472
3756e7ca 1473This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1474
45394607 1475If you want to override the mapping of Hangul Syllables,
abd1ec54
NC
1476NFD, NFKD, and FCD are not appropriate,
1477since they will decompose Hangul Syllables before overriding.
45394607 1478
a7fbee98
JH
1479If C<undef> is passed explicitly as the value for this key,
1480weight for Hangul Syllables is treated as undefined
1481without decomposition into Hangul Jamo.
1482But definition of weight for Hangul Syllables
91ae00cb 1483in table or C<entry> is still valid.
a7fbee98 1484
45394607
JH
1485=item preprocess
1486
4d36a948 1487-- see 5.1 Preprocessing, UTS #10.
45394607
JH
1488
1489If specified, the coderef is used to preprocess
1490before the formation of sort keys.
1491
a7fbee98 1492ex. dropping English articles, such as "a" or "the".
45394607
JH
1493Then, "the pen" is before "a pencil".
1494
1495 preprocess => sub {
1496 my $str = shift;
a7fbee98 1497 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1498 return $str;
45394607
JH
1499 },
1500
91ae00cb 1501C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1502
45394607
JH
1503=item rearrange
1504
4d36a948 1505-- see 3.1.3 Rearrangement, UTS #10.
45394607
JH
1506
1507Characters that are not coded in logical order and to be rearranged.
3756e7ca 1508If C<UCA_Version> is equal to or lesser than 11, default is:
45394607
JH
1509
1510 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1511
3756e7ca
RGS
1512If you want to disallow any rearrangement, pass C<undef> or C<[]>
1513(a reference to empty list) as the value for this key.
1514
74b94a79
CBW
1515If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1516(i.e. no rearrangement).
a7fbee98 1517
0116f5dc
JH
1518B<According to the version 9 of UCA, this parameter shall not be used;
1519but it is not warned at present.>
1520
45394607
JH
1521=item table
1522
4d36a948 1523-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1524
91ae00cb 1525You can use another collation element table if desired.
45394607 1526
e7f779c8 1527The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1528on C<@INC>. Say, if the filename is F<Foo.txt>,
1529the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1530
1531By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1532If you will prepare your own table file, any name other than F<allkeys.txt>
1533may be better to avoid namespace conflict.
45394607 1534
00e00351
CBW
1535B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1536module, and it may save time at the run time.
1537Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1538or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
1539will prevent this module using the compiled DUCET.
1540
a7fbee98 1541If C<undef> is passed explicitly as the value for this key,
91ae00cb 1542no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1543
1544A typical way to define a collation element table
1545without any file of table:
1546
1547 $onlyABC = Unicode::Collate->new(
1548 table => undef,
1549 entry => << 'ENTRIES',
15500061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
15510041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
15520062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
15530042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
15540063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
15550043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1556ENTRIES
1557 );
905aa9f0 1558
3756e7ca
RGS
1559If C<ignoreName> or C<undefName> is used, character names should be
1560specified as a comment (following C<#>) on each line.
45394607
JH
1561
1562=item undefChar
1563
3756e7ca
RGS
1564=item undefName
1565
4d36a948 1566-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607
JH
1567
1568Undefines the collation element as if it were unassigned in the table.
1569This reduces the size of the table.
1570If an unassigned character appears in the string to be collated,
1571the sort key is made from its codepoint
1572as a single-character collation element,
1573as it is greater than any other assigned collation elements
1574(in the codepoint order among the unassigned characters).
1575But, it'd be better to ignore characters
1576unfamiliar to you and maybe never used.
1577
3756e7ca
RGS
1578Through C<undefChar>, any character matching C<qr/$undefChar/>
1579will be undefined. Through C<undefName>, any character whose name
1580(given in the C<table> file as a comment) matches C<qr/$undefName/>
1581will be undefined.
1582
e7f779c8
RGS
1583ex. Collation weights for beyond-BMP characters are not stored in object:
1584
1585 undefChar => qr/[^\0-\x{fffd}]/,
1586
45394607
JH
1587=item upper_before_lower
1588
3756e7ca 1589-- see 6.6 Case Comparisons, UTS #10.
45394607 1590
3756e7ca
RGS
1591By default, lowercase is before uppercase.
1592If the parameter is made true, this is reversed.
45394607 1593
3756e7ca
RGS
1594B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1595distinctions must occur in level 3, and their weights at level 3 must be
1596same as those mentioned in 7.3.1, UTS #10.
1597If you define your collation elements which differs from this requirement,
1598this parameter doesn't work validly.
45394607 1599
91ae00cb
NC
1600=item variable
1601
91ae00cb
NC
1602-- see 3.2.2 Variable Weighting, UTS #10.
1603
91ae00cb
NC
1604This key allows to variable weighting for variable collation elements,
1605which are marked with an ASTERISK in the table
1606(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1607
1608 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1609
1610These names are case-insensitive.
1611By default (if specification is omitted), 'shifted' is adopted.
1612
1613 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1614 considered at the 4th level.
1615
abd1ec54 1616 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1617
1618 'Shifted' Variable elements are made ignorable at levels 1 through 3
1619 their level 4 weight is replaced by the old level 1 weight.
1620 Level 4 weight for Non-Variable elements is 0xFFFF.
1621
1622 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1623 are trimmed.
1624
45394607
JH
1625=back
1626
3164dd77 1627=head2 Methods for Collation
45394607
JH
1628
1629=over 4
1630
5398038e 1631=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1632
1633Sorts a list of strings.
1634
5398038e 1635=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1636
1637Returns 1 (when C<$a> is greater than C<$b>)
1638or 0 (when C<$a> is equal to C<$b>)
1639or -1 (when C<$a> is lesser than C<$b>).
1640
5398038e
TS
1641=item C<$result = $Collator-E<gt>eq($a, $b)>
1642
1643=item C<$result = $Collator-E<gt>ne($a, $b)>
1644
1645=item C<$result = $Collator-E<gt>lt($a, $b)>
1646
1647=item C<$result = $Collator-E<gt>le($a, $b)>
1648
1649=item C<$result = $Collator-E<gt>gt($a, $b)>
1650
1651=item C<$result = $Collator-E<gt>ge($a, $b)>
1652
a7fbee98 1653They works like the same name operators as theirs.
5398038e
TS
1654
1655 eq : whether $a is equal to $b.
1656 ne : whether $a is not equal to $b.
1657 lt : whether $a is lesser than $b.
1658 le : whether $a is lesser than $b or equal to $b.
1659 gt : whether $a is greater than $b.
1660 ge : whether $a is greater than $b or equal to $b.
1661
1662=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1663
3756e7ca 1664-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1665
1666Returns a sort key.
1667
1668You compare the sort keys using a binary comparison
1669and get the result of the comparison of the strings using UCA.
1670
5398038e 1671 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1672
1673 is equivalent to
1674
5398038e 1675 $Collator->cmp($a, $b)
45394607 1676
a7fbee98
JH
1677=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1678
3756e7ca
RGS
1679Converts a sorting key into its representation form.
1680If C<UCA_Version> is 8, the output is slightly different.
1681
a7fbee98
JH
1682 use Unicode::Collate;
1683 my $c = Unicode::Collate->new();
1684 print $c->viewSortKey("Perl"),"\n";
1685
0116f5dc
JH
1686 # output:
1687 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1688 # Level 1 Level 2 Level 3 Level 4
1689
4d36a948
TS
1690=back
1691
1692=head2 Methods for Searching
d16e9e3d 1693
3756e7ca 1694B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
4d36a948
TS
1695for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1696C<subst>, C<gsubst>) is croaked,
1697as the position and the length might differ
1698from those on the specified string.
3756e7ca 1699(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
d16e9e3d 1700
4d36a948
TS
1701The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1702like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1703but they are not aware of any pattern, but only a literal substring.
1704
1705=over 4
1706
1707=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1708
1709=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1710
1711If C<$substring> matches a part of C<$string>, returns
1712the position of the first occurrence of the matching part in scalar context;
1713in list context, returns a two-element list of
1714the position and the length of the matching part.
1715
d16e9e3d
JH
1716If C<$substring> does not match any part of C<$string>,
1717returns C<-1> in scalar context and
1718an empty list in list context.
1719
1720e.g. you say
1721
5398038e 1722 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948
TS
1723 # (normalization => undef) is REQUIRED.
1724 my $str = "Ich muß studieren Perl.";
1725 my $sub = "MÃœSS";
d16e9e3d 1726 my $match;
a7fbee98 1727 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1728 $match = substr($str, $pos, $len);
d16e9e3d
JH
1729 }
1730
4d36a948 1731and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1732is primary equal to C<"MÃœSS">.
4d36a948
TS
1733
1734=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1735
1736=item C<($match) = $Collator-E<gt>match($string, $substring)>
1737
1738If C<$substring> matches a part of C<$string>, in scalar context, returns
1739B<a reference to> the first occurrence of the matching part
1740(C<$match_ref> is always true if matches,
1741since every reference is B<true>);
1742in list context, returns the first occurrence of the matching part.
1743
1744If C<$substring> does not match any part of C<$string>,
1745returns C<undef> in scalar context and
1746an empty list in list context.
1747
1748e.g.
1749
1750 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1751 print "matches [$$match_ref].\n";
1752 } else {
1753 print "doesn't match.\n";
1754 }
1755
3756e7ca 1756 or
4d36a948
TS
1757
1758 if (($match) = $Collator->match($str, $sub)) { # list context
1759 print "matches [$match].\n";
1760 } else {
1761 print "doesn't match.\n";
1762 }
1763
1764=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1765
1766If C<$substring> matches a part of C<$string>, returns
1767all the matching parts (or matching count in scalar context).
1768
1769If C<$substring> does not match any part of C<$string>,
1770returns an empty list.
1771
1772=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1773
1774If C<$substring> matches a part of C<$string>,
1775the first occurrence of the matching part is replaced by C<$replacement>
1776(C<$string> is modified) and return C<$count> (always equals to C<1>).
1777
1778C<$replacement> can be a C<CODEREF>,
1779taking the matching part as an argument,
1780and returning a string to replace the matching part
1781(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1782
1783=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1784
1785If C<$substring> matches a part of C<$string>,
1786all the occurrences of the matching part is replaced by C<$replacement>
1787(C<$string> is modified) and return C<$count>.
1788
1789C<$replacement> can be a C<CODEREF>,
1790taking the matching part as an argument,
1791and returning a string to replace the matching part
1792(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1793
1794e.g.
1795
1796 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1797 # (normalization => undef) is REQUIRED.
3756e7ca 1798 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
4d36a948
TS
1799 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1800
3756e7ca 1801 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
4d36a948 1802 # i.e., all the camels are made bold-faced.
d16e9e3d 1803
45394607
JH
1804=back
1805
3164dd77
TS
1806=head2 Other Methods
1807
1808=over 4
1809
0116f5dc
JH
1810=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1811
1812Change the value of specified keys and returns the changed part.
1813
1814 $Collator = Unicode::Collate->new(level => 4);
1815
1816 $Collator->eq("perl", "PERL"); # false
1817
1818 %old = $Collator->change(level => 2); # returns (level => 4).
1819
1820 $Collator->eq("perl", "PERL"); # true
1821
1822 $Collator->change(%old); # returns (level => 2).
1823
1824 $Collator->eq("perl", "PERL"); # false
1825
1826Not all C<(key,value)>s are allowed to be changed.
1827See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1828
1829In the scalar context, returns the modified collator
1830(but it is B<not> a clone from the original).
1831
1832 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1833
1834 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1835
1836 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1837
91ae00cb 1838=item C<$version = $Collator-E<gt>version()>
3164dd77 1839
91ae00cb
NC
1840Returns the version number (a string) of the Unicode Standard
1841which the C<table> file used by the collator object is based on.
1842If the table does not include a version line (starting with C<@version>),
1843returns C<"unknown">.
1844
1845=item C<UCA_Version()>
3164dd77 1846
91ae00cb 1847Returns the tracking version number of UTS #10 this module consults.
3164dd77 1848
91ae00cb
NC
1849=item C<Base_Unicode_Version()>
1850
1851Returns the version number of UTS #10 this module consults.
3164dd77
TS
1852
1853=back
1854
3756e7ca
RGS
1855=head1 EXPORT
1856
1857No method will be exported.
45394607 1858
0d50d293
RGS
1859=head1 INSTALL
1860
1861Though this module can be used without any C<table> file,
1862to use this module easily, it is recommended to install a table file
1863in the UCA format, by copying it under the directory
1864<a place in @INC>/Unicode/Collate.
1865
6d24ed10
SP
1866The most preferable one is "The Default Unicode Collation Element Table"
1867(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
1868
1869 http://www.unicode.org/Public/UCA/
1870
1871 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1872
1873If DUCET is not installed, it is recommended to copy the file
1874from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1875to <a place in @INC>/Unicode/Collate/allkeys.txt
1876manually.
1877
3756e7ca
RGS
1878=head1 CAVEATS
1879
1880=over 4
45394607 1881
3756e7ca 1882=item Normalization
45394607 1883
3756e7ca
RGS
1884Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1885module (see L<Unicode::Normalize>).
45394607 1886
5398038e 1887If you need not it (say, in the case when you need not
45394607
JH
1888handle any combining characters),
1889assign C<normalization =E<gt> undef> explicitly.
1890
4d36a948 1891-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 1892
3756e7ca 1893=item Conformance Test
0116f5dc 1894
10d7ec48
NC
1895The Conformance Test for the UCA is available
1896under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
1897
1898For F<CollationTest_SHIFTED.txt>,
1899a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1900for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 1901C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 1902
4d36a948 1903B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 1904
3756e7ca
RGS
1905=back
1906
0d50d293 1907=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 1908
0d50d293 1909The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
74b94a79 1910<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
0d50d293 1911SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 1912
0d50d293
RGS
1913This module is free software; you can redistribute it and/or
1914modify it under the same terms as Perl itself.
45394607 1915
00e00351
CBW
1916The file Unicode/Collate/allkeys.txt was copied verbatim
1917from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
1918This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
6d24ed10 1919Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
45394607
JH
1920
1921=head1 SEE ALSO
1922
1923=over 4
1924
91ae00cb
NC
1925=item Unicode Collation Algorithm - UTS #10
1926
1927L<http://www.unicode.org/reports/tr10/>
1928
1929=item The Default Unicode Collation Element Table (DUCET)
1930
10d7ec48 1931L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 1932
91ae00cb 1933=item The conformance test for the UCA
45394607 1934
10d7ec48 1935L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 1936
10d7ec48 1937L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 1938
91ae00cb 1939=item Hangul Syllable Type
0116f5dc 1940
10d7ec48 1941L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 1942
91ae00cb 1943=item Unicode Normalization Forms - UAX #15
a7fbee98 1944
91ae00cb 1945L<http://www.unicode.org/reports/tr15/>
a7fbee98 1946
45394607
JH
1947=back
1948
1949=cut