This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
bump Unicode-Collate version for blead changes
[perl5.git] / cpan / Unicode-Collate / Collate.pm
CommitLineData
45394607
JH
1package Unicode::Collate;
2
4a2e806c 3BEGIN {
ae6aa562 4 unless ("A" eq pack('U', 0x41)) {
9f1f04a1 5 die "Unicode::Collate cannot stringify a Unicode code point\n";
4a2e806c
JH
6 }
7}
8
45394607
JH
9use 5.006;
10use strict;
11use warnings;
12use Carp;
e69a2255 13use File::Spec;
5398038e 14
10d7ec48
NC
15no warnings 'utf8';
16
1a928065 17our $VERSION = '0.6801';
45394607
JH
18our $PACKAGE = __PACKAGE__;
19
e7f779c8
RGS
20my @Path = qw(Unicode Collate);
21my $KeyFile = "allkeys.txt";
45394607 22
4d36a948
ST
23# Perl's boolean
24use constant TRUE => 1;
25use constant FALSE => "";
26use constant NOMATCHPOS => -1;
27
28# A coderef to get combining class imported from Unicode::Normalize
29# (i.e. \&Unicode::Normalize::getCombinClass).
30# This is also used as a HAS_UNICODE_NORMALIZE flag.
e7f779c8 31my $CVgetCombinClass;
4d36a948 32
9f1f04a1
RGS
33# Supported Levels
34use constant MinLevel => 1;
35use constant MaxLevel => 4;
36
4d36a948 37# Minimum weights at level 2 and 3, respectively
9f1f04a1
RGS
38use constant Min2Wt => 0x20;
39use constant Min3Wt => 0x02;
4d36a948
ST
40
41# Shifted weight at 4th level
9f1f04a1 42use constant Shift4Wt => 0xFFFF;
4d36a948 43
4d36a948
ST
44# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45# PROBLEM: The Default Unicode Collation Element Table
46# has weights over 0xFFFF at the 4th level.
47# The tie-breaking in the variable weights
48# other than "shift" (as well as "shift-trimmed") is unreliable.
49use constant VCE_TEMPLATE => 'Cn4';
50
4d36a948
ST
51# A sort key: 16-bit weights
52# See also the PROBLEM on VCE_TEMPLATE above.
53use constant KEY_TEMPLATE => 'n*';
54
55# Level separator in a sort key:
56# i.e. pack(KEY_TEMPLATE, 0)
57use constant LEVEL_SEP => "\0\0";
58
59# As Unicode code point separator for hash keys.
60# A joined code point string (denoted by JCPS below)
61# like "65;768" is used for internal processing
62# instead of Perl's Unicode string like "\x41\x{300}",
63# as the native code point is different from the Unicode code point
64# on EBCDIC platform.
65# This character must not be included in any stringified
66# representation of an integer.
67use constant CODE_SEP => ';';
68
69# boolean values of variable weights
0116f5dc
JH
70use constant NON_VAR => 0; # Non-Variable character
71use constant VAR => 1; # Variable character
3164dd77 72
91ae00cb 73# specific code points
00e00351
CBW
74use constant Hangul_SBase => 0xAC00;
75use constant Hangul_SIni => 0xAC00;
76use constant Hangul_SFin => 0xD7A3;
77use constant Hangul_NCount => 588;
78use constant Hangul_TCount => 28;
91ae00cb
NC
79use constant Hangul_LBase => 0x1100;
80use constant Hangul_LIni => 0x1100;
81use constant Hangul_LFin => 0x1159;
82use constant Hangul_LFill => 0x115F;
b5d9a953 83use constant Hangul_LEnd => 0x115F; # Unicode 5.2
91ae00cb 84use constant Hangul_VBase => 0x1161;
3756e7ca 85use constant Hangul_VIni => 0x1160; # from Vowel Filler
91ae00cb 86use constant Hangul_VFin => 0x11A2;
b5d9a953 87use constant Hangul_VEnd => 0x11A7; # Unicode 5.2
3756e7ca 88use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
91ae00cb
NC
89use constant Hangul_TIni => 0x11A8;
90use constant Hangul_TFin => 0x11F9;
b5d9a953
CBW
91use constant Hangul_TEnd => 0x11FF; # Unicode 5.2
92use constant HangulL2Ini => 0xA960; # Unicode 5.2
93use constant HangulL2Fin => 0xA97C; # Unicode 5.2
94use constant HangulV2Ini => 0xD7B0; # Unicode 5.2
95use constant HangulV2Fin => 0xD7C6; # Unicode 5.2
96use constant HangulT2Ini => 0xD7CB; # Unicode 5.2
97use constant HangulT2Fin => 0xD7FB; # Unicode 5.2
98
99use constant CJK_UidIni => 0x4E00;
100use constant CJK_UidFin => 0x9FA5;
101use constant CJK_UidF41 => 0x9FBB;
102use constant CJK_UidF51 => 0x9FC3;
103use constant CJK_UidF52 => 0x9FCB;
104use constant CJK_ExtAIni => 0x3400; # Unicode 3.0
105use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0
106use constant CJK_ExtBIni => 0x20000; # Unicode 3.1
107use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1
108use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2
109use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2
110use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0
111use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0
91ae00cb 112
584e761d
CBW
113my %CompatUI = map +($_ => 1), (
114 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
115 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
116);
117
4d36a948 118# Logical_Order_Exception in PropList.txt
e7f779c8 119my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
a7fbee98 120
00e00351 121sub UCA_Version { "20" }
a7fbee98 122
00e00351 123sub Base_Unicode_Version { "5.2.0" }
a7fbee98 124
9f1f04a1
RGS
125######
126
9f1f04a1 127sub pack_U {
ae6aa562 128 return pack('U*', @_);
9f1f04a1
RGS
129}
130
131sub unpack_U {
3756e7ca 132 return unpack('U*', shift(@_).pack('U*'));
9f1f04a1
RGS
133}
134
135######
136
91ae00cb
NC
137my (%VariableOK);
138@VariableOK{ qw/
0116f5dc 139 blanked non-ignorable shifted shift-trimmed
91ae00cb 140 / } = (); # keys lowercased
0116f5dc
JH
141
142our @ChangeOK = qw/
143 alternate backwards level normalization rearrange
144 katakana_before_hiragana upper_before_lower
145 overrideHangul overrideCJK preprocess UCA_Version
91ae00cb 146 hangul_terminator variable
0116f5dc
JH
147 /;
148
149our @ChangeNG = qw/
b5d9a953 150 entry mapping table maxlength contraction
91ae00cb 151 ignoreChar ignoreName undefChar undefName variableTable
0116f5dc 152 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
00e00351 153 derivCode normCode rearrangeHash backwardsFlag
aa7758f7 154 suppress suppressHash
0116f5dc 155 /;
9f1f04a1
RGS
156# The hash key 'ignored' is deleted at v 0.21.
157# The hash key 'isShift' is deleted at v 0.23.
158# The hash key 'combining' is deleted at v 0.24.
91ae00cb 159# The hash key 'entries' is deleted at v 0.30.
e7f779c8 160# The hash key 'L3_ignorable' is deleted at v 0.40.
91ae00cb
NC
161
162sub version {
163 my $self = shift;
164 return $self->{versionTable} || 'unknown';
165}
0116f5dc
JH
166
167my (%ChangeOK, %ChangeNG);
168@ChangeOK{ @ChangeOK } = ();
169@ChangeNG{ @ChangeNG } = ();
170
171sub change {
172 my $self = shift;
173 my %hash = @_;
174 my %old;
91ae00cb
NC
175 if (exists $hash{variable} && exists $hash{alternate}) {
176 delete $hash{alternate};
177 }
178 elsif (!exists $hash{variable} && exists $hash{alternate}) {
179 $hash{variable} = $hash{alternate};
180 }
0116f5dc
JH
181 foreach my $k (keys %hash) {
182 if (exists $ChangeOK{$k}) {
183 $old{$k} = $self->{$k};
184 $self->{$k} = $hash{$k};
185 }
186 elsif (exists $ChangeNG{$k}) {
187 croak "change of $k via change() is not allowed!";
188 }
189 # else => ignored
190 }
3756e7ca 191 $self->checkCollator();
0116f5dc
JH
192 return wantarray ? %old : $self;
193}
a7fbee98 194
9f1f04a1
RGS
195sub _checkLevel {
196 my $level = shift;
abd1ec54
NC
197 my $key = shift; # 'level' or 'backwards'
198 MinLevel <= $level or croak sprintf
199 "Illegal level %d (in value for key '%s') lower than %d.",
200 $level, $key, MinLevel;
201 $level <= MaxLevel or croak sprintf
202 "Unsupported level %d (in value for key '%s') higher than %d.",
203 $level, $key, MaxLevel;
9f1f04a1
RGS
204}
205
91ae00cb
NC
206my %DerivCode = (
207 8 => \&_derivCE_8,
208 9 => \&_derivCE_9,
209 11 => \&_derivCE_9, # 11 == 9
3756e7ca 210 14 => \&_derivCE_14,
74b94a79
CBW
211 16 => \&_derivCE_14, # 16 == 14
212 18 => \&_derivCE_18,
00e00351 213 20 => \&_derivCE_20,
b5d9a953 214 22 => \&_derivCE_22,
91ae00cb
NC
215);
216
0116f5dc
JH
217sub checkCollator {
218 my $self = shift;
9f1f04a1 219 _checkLevel($self->{level}, "level");
a7fbee98 220
91ae00cb
NC
221 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
222 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
a7fbee98 223
91ae00cb 224 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
10d7ec48 225 $self->{alternateTable} || 'shifted';
91ae00cb
NC
226 $self->{variable} = $self->{alternate} = lc($self->{variable});
227 exists $VariableOK{ $self->{variable} }
3756e7ca 228 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
0116f5dc 229
9f1f04a1
RGS
230 if (! defined $self->{backwards}) {
231 $self->{backwardsFlag} = 0;
232 }
233 elsif (! ref $self->{backwards}) {
234 _checkLevel($self->{backwards}, "backwards");
235 $self->{backwardsFlag} = 1 << $self->{backwards};
236 }
237 else {
238 my %level;
239 $self->{backwardsFlag} = 0;
240 for my $b (@{ $self->{backwards} }) {
241 _checkLevel($b, "backwards");
242 $level{$b} = 1;
243 }
244 for my $v (sort keys %level) {
245 $self->{backwardsFlag} += 1 << $v;
246 }
247 }
0116f5dc 248
91ae00cb
NC
249 defined $self->{rearrange} or $self->{rearrange} = [];
250 ref $self->{rearrange}
251 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
0116f5dc
JH
252
253 # keys of $self->{rearrangeHash} are $self->{rearrange}.
254 $self->{rearrangeHash} = undef;
255
256 if (@{ $self->{rearrange} }) {
257 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
258 }
259
260 $self->{normCode} = undef;
a7fbee98
JH
261
262 if (defined $self->{normalization}) {
263 eval { require Unicode::Normalize };
91ae00cb 264 $@ and croak "Unicode::Normalize is required to normalize strings";
a7fbee98 265
91ae00cb 266 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
a7fbee98 267
91ae00cb
NC
268 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
269 $self->{normCode} = \&Unicode::Normalize::NFD;
270 }
271 elsif ($self->{normalization} ne 'prenormalized') {
06c8fc8f
RGS
272 my $norm = $self->{normalization};
273 $self->{normCode} = sub {
1d2654e1
JH
274 Unicode::Normalize::normalize($norm, shift);
275 };
06c8fc8f
RGS
276 eval { $self->{normCode}->("") }; # try
277 $@ and croak "$PACKAGE unknown normalization form name: $norm";
1d2654e1 278 }
a7fbee98 279 }
0116f5dc
JH
280 return;
281}
282
283sub new
284{
285 my $class = shift;
286 my $self = bless { @_ }, $class;
45394607 287
aa7758f7
CBW
288 # keys of $self->{suppressHash} are $self->{suppress}.
289 if ($self->{suppress} && @{ $self->{suppress} }) {
290 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
291 } # before read_table()
292
a7fbee98 293 # If undef is passed explicitly, no file is read.
0116f5dc 294 $self->{table} = $KeyFile if ! exists $self->{table};
3756e7ca 295 $self->read_table() if defined $self->{table};
905aa9f0 296
a7fbee98 297 if ($self->{entry}) {
e7f779c8
RGS
298 while ($self->{entry} =~ /([^\n]+)/g) {
299 $self->parseEntry($1);
300 }
a7fbee98 301 }
905aa9f0 302
9f1f04a1 303 $self->{level} ||= MaxLevel;
0116f5dc 304 $self->{UCA_Version} ||= UCA_Version();
905aa9f0 305
abd1ec54 306 $self->{overrideHangul} = FALSE
0116f5dc 307 if ! exists $self->{overrideHangul};
abd1ec54 308 $self->{overrideCJK} = FALSE
0116f5dc 309 if ! exists $self->{overrideCJK};
06c8fc8f 310 $self->{normalization} = 'NFD'
0116f5dc 311 if ! exists $self->{normalization};
3756e7ca
RGS
312 $self->{rearrange} = $self->{rearrangeTable} ||
313 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
a7fbee98 314 if ! exists $self->{rearrange};
0116f5dc
JH
315 $self->{backwards} = $self->{backwardsTable}
316 if ! exists $self->{backwards};
a7fbee98 317
3756e7ca 318 $self->checkCollator();
a7fbee98
JH
319
320 return $self;
321}
905aa9f0 322
00e00351
CBW
323sub parseAtmark {
324 my $self = shift;
325 my $line = shift; # after s/^\s*\@//
326
327 if ($line =~ /^version\s*(\S*)/) {
328 $self->{versionTable} ||= $1;
329 }
330 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
331 $self->{variableTable} ||= $1;
332 }
333 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
334 $self->{alternateTable} ||= $1;
335 }
336 elsif ($line =~ /^backwards\s+(\S*)/) {
337 push @{ $self->{backwardsTable} }, $1;
338 }
339 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
340 push @{ $self->{forwardsTable} }, $1;
341 }
342 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
343 push @{ $self->{rearrangeTable} }, _getHexArray($1);
344 }
345}
346
905aa9f0 347sub read_table {
a7fbee98 348 my $self = shift;
a7fbee98 349
e7f779c8
RGS
350 my($f, $fh);
351 foreach my $d (@INC) {
352 $f = File::Spec->catfile($d, @Path, $self->{table});
353 last if open($fh, $f);
354 $f = undef;
355 }
0d50d293
RGS
356 if (!defined $f) {
357 $f = File::Spec->catfile(@Path, $self->{table});
358 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
359 }
a7fbee98 360
6d24ed10
SP
361 while (my $line = <$fh>) {
362 next if $line =~ /^\s*#/;
abd1ec54 363
00e00351
CBW
364 if ($line =~ s/^\s*\@//) {
365 $self->parseAtmark($line);
366 } else {
367 $self->parseEntry($line);
abd1ec54 368 }
45394607 369 }
e7f779c8 370 close $fh;
45394607
JH
371}
372
905aa9f0 373
45394607
JH
374##
375## get $line, parse it, and write an entry in $self
376##
377sub parseEntry
378{
a7fbee98
JH
379 my $self = shift;
380 my $line = shift;
4d36a948 381 my($name, $entry, @uv, @key);
a7fbee98
JH
382
383 return if $line !~ /^\s*[0-9A-Fa-f]/;
384
385 # removes comment and gets name
386 $name = $1
387 if $line =~ s/[#%]\s*(.*)//;
388 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
389
390 # gets element
391 my($e, $k) = split /;/, $line;
392 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
393 if ! $k;
394
4d36a948
ST
395 @uv = _getHexArray($e);
396 return if !@uv;
aa7758f7
CBW
397 return if @uv > 1 && $self->{suppressHash} &&
398 exists $self->{suppressHash}{$uv[0]};
4d36a948 399 $entry = join(CODE_SEP, @uv); # in JCPS
0116f5dc 400
4d36a948 401 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
9f1f04a1 402 my $ele = pack_U(@uv);
a7fbee98 403
4d36a948
ST
404 # regarded as if it were not entried in the table
405 return
406 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
caffd4cf 407
4d36a948
ST
408 # replaced as completely ignorable
409 $k = '[.0000.0000.0000.0000]'
410 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
45394607 411 }
0116f5dc 412
4d36a948
ST
413 # replaced as completely ignorable
414 $k = '[.0000.0000.0000.0000]'
415 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
416
4c843366 417 my $is_L3_ignorable = TRUE;
4d36a948 418
caffd4cf
ST
419 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
420 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
4d36a948
ST
421 my @wt = _getHexArray($arr);
422 push @key, pack(VCE_TEMPLATE, $var, @wt);
4c843366 423 $is_L3_ignorable = FALSE
3756e7ca
RGS
424 if $wt[0] || $wt[1] || $wt[2];
425 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
426 # is completely ignorable.
4c843366
JH
427 # For expansion, an entry $is_L3_ignorable
428 # if and only if "all" CEs are [.0000.0000.0000].
a7fbee98 429 }
caffd4cf 430
e7f779c8 431 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
caffd4cf 432
91ae00cb 433 if (@uv > 1) {
b5d9a953
CBW
434 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
435 $self->{maxlength}{$uv[0]} = @uv;
436 }
437 }
438 if (@uv > 2) {
439 while (@uv) {
440 pop @uv;
441 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
442 $self->{contraction}{$fake_entry} = 1;
443 }
91ae00cb 444 }
45394607
JH
445}
446
9f1f04a1 447
45394607 448##
aa7758f7 449## VCE = _varCE(variable, VCE)
45394607 450##
abd1ec54 451sub _varCE
45394607 452{
abd1ec54
NC
453 my $vbl = shift;
454 my $vce = shift;
455 if ($vbl eq 'non-ignorable') {
456 return $vce;
457 }
458 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
459
460 if ($var) {
461 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
462 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
463 }
464 elsif ($vbl eq 'blanked') {
465 return $vce;
466 }
467 else {
468 return pack(VCE_TEMPLATE, $var, @wt[0..2],
469 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
470 }
45394607
JH
471}
472
45394607
JH
473sub viewSortKey
474{
a7fbee98 475 my $self = shift;
9f1f04a1
RGS
476 $self->visualizeSortKey($self->getSortKey(@_));
477}
0116f5dc 478
9f1f04a1
RGS
479sub visualizeSortKey
480{
481 my $self = shift;
482 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
4d36a948 483
9f1f04a1 484 if ($self->{UCA_Version} <= 8) {
0116f5dc
JH
485 $view =~ s/ ?0000 ?/|/g;
486 } else {
487 $view =~ s/\b0000\b/|/g;
488 }
a7fbee98 489 return "[$view]";
45394607
JH
490}
491
d16e9e3d 492
45394607 493##
91ae00cb
NC
494## arrayref of JCPS = splitEnt(string to be collated)
495## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
45394607 496##
91ae00cb 497sub splitEnt
45394607 498{
a7fbee98 499 my $self = shift;
4d36a948
ST
500 my $wLen = $_[1];
501
a7fbee98 502 my $code = $self->{preprocess};
0116f5dc 503 my $norm = $self->{normCode};
91ae00cb 504 my $map = $self->{mapping};
a7fbee98
JH
505 my $max = $self->{maxlength};
506 my $reH = $self->{rearrangeHash};
b5d9a953
CBW
507 my $vers = $self->{UCA_Version};
508 my $ver9 = $vers >= 9 && $vers <= 11;
a7fbee98 509
4d36a948 510 my ($str, @buf);
a7fbee98 511
4d36a948
ST
512 if ($wLen) {
513 $code and croak "Preprocess breaks character positions. "
514 . "Don't use with index(), match(), etc.";
515 $norm and croak "Normalization breaks character positions. "
516 . "Don't use with index(), match(), etc.";
517 $str = $_[0];
518 }
519 else {
520 $str = $_[0];
521 $str = &$code($str) if ref $code;
522 $str = &$norm($str) if ref $norm;
523 }
a7fbee98 524
4d36a948 525 # get array of Unicode code point of string.
9f1f04a1 526 my @src = unpack_U($str);
4d36a948
ST
527
528 # rearrangement:
529 # Character positions are not kept if rearranged,
530 # then neglected if $wLen is true.
531 if ($reH && ! $wLen) {
a7fbee98
JH
532 for (my $i = 0; $i < @src; $i++) {
533 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
534 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
535 $i++;
536 }
537 }
45394607 538 }
45394607 539
3756e7ca 540 # remove a code point marked as a completely ignorable.
abd1ec54 541 for (my $i = 0; $i < @src; $i++) {
b5d9a953
CBW
542 if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
543 $src[$i] = undef;
544 } elsif ($ver9) {
545 $src[$i] = undef if $map->{ $src[$i] } &&
546 @{ $map->{ $src[$i] } } == 0;
547 }
0116f5dc
JH
548 }
549
a7fbee98 550 for (my $i = 0; $i < @src; $i++) {
91ae00cb 551 my $jcps = $src[$i];
3756e7ca
RGS
552
553 # skip removed code point
554 if (! defined $jcps) {
555 if ($wLen && @buf) {
556 $buf[-1][2] = $i + 1;
557 }
558 next;
559 }
560
abd1ec54 561 my $i_orig = $i;
4d36a948 562
3756e7ca
RGS
563 # find contraction
564 if ($max->{$jcps}) {
91ae00cb
NC
565 my $temp_jcps = $jcps;
566 my $jcpsLen = 1;
567 my $maxLen = $max->{$jcps};
4d36a948 568
91ae00cb 569 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
4d36a948 570 next if ! defined $src[$p];
91ae00cb
NC
571 $temp_jcps .= CODE_SEP . $src[$p];
572 $jcpsLen++;
573 if ($map->{$temp_jcps}) {
574 $jcps = $temp_jcps;
4d36a948
ST
575 $i = $p;
576 }
577 }
4d36a948 578
b5d9a953 579 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
06c8fc8f 580 # This process requires Unicode::Normalize.
91ae00cb 581 # If "normalization" is undef, here should be skipped *always*
06c8fc8f
RGS
582 # (in spite of bool value of $CVgetCombinClass),
583 # since canonical ordering cannot be expected.
584 # Blocked combining character should not be contracted.
585
06c8fc8f 586 # $self->{normCode} is false in the case of "prenormalized".
b5d9a953
CBW
587 if ($self->{normalization}) {
588 my $cont = $self->{contraction};
06c8fc8f 589 my $preCC = 0;
b5d9a953
CBW
590 my $preCC_uc = 0;
591 my $jcps_uc = $jcps;
592 my(@out, @out_uc);
06c8fc8f
RGS
593
594 for (my $p = $i + 1; $p < @src; $p++) {
595 next if ! defined $src[$p];
b5d9a953 596 my $curCC = $CVgetCombinClass->($src[$p]);
06c8fc8f
RGS
597 last unless $curCC;
598 my $tail = CODE_SEP . $src[$p];
b5d9a953
CBW
599
600 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
601 $cont->{$jcps_uc.$tail})) {
602 $jcps_uc .= $tail;
603 push @out_uc, $p;
604 } else {
605 $preCC_uc = $curCC;
606 }
607
91ae00cb
NC
608 if ($preCC != $curCC && $map->{$jcps.$tail}) {
609 $jcps .= $tail;
b5d9a953 610 push @out, $p;
06c8fc8f
RGS
611 } else {
612 $preCC = $curCC;
613 }
4d36a948 614 }
b5d9a953
CBW
615
616 if ($map->{$jcps_uc}) {
617 $jcps = $jcps_uc;
618 $src[$_] = undef for @out_uc;
619 } else {
620 $src[$_] = undef for @out;
621 }
a7fbee98 622 }
a7fbee98
JH
623 }
624
3756e7ca
RGS
625 # skip completely ignorable
626 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
627 if ($wLen && @buf) {
628 $buf[-1][2] = $i + 1;
a7fbee98 629 }
3756e7ca 630 next;
a7fbee98 631 }
4d36a948 632
91ae00cb 633 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
45394607 634 }
4d36a948 635 return \@buf;
d16e9e3d 636}
45394607 637
539ce3d8
CBW
638##
639## VCE = _pack_override(input, codepoint, derivCode)
640##
641sub _pack_override ($$$) {
642 my $r = shift;
643 my $u = shift;
644 my $der = shift;
645
646 if (ref $r) {
647 return pack(VCE_TEMPLATE, NON_VAR, @$r);
648 } elsif (defined $r) {
649 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
650 } else {
651 return $der->($u);
652 }
653}
d16e9e3d
JH
654
655##
abd1ec54 656## list of VCE = getWt(JCPS)
d16e9e3d
JH
657##
658sub getWt
659{
a7fbee98 660 my $self = shift;
91ae00cb 661 my $u = shift;
abd1ec54 662 my $vbl = $self->{variable};
91ae00cb 663 my $map = $self->{mapping};
0116f5dc 664 my $der = $self->{derivCode};
a7fbee98 665
91ae00cb 666 return if !defined $u;
abd1ec54 667 return map(_varCE($vbl, $_), @{ $map->{$u} })
91ae00cb 668 if $map->{$u};
a7fbee98 669
91ae00cb
NC
670 # JCPS must not be a contraction, then it's a code point.
671 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
1d2654e1
JH
672 my $hang = $self->{overrideHangul};
673 my @hangulCE;
674 if ($hang) {
539ce3d8
CBW
675 @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
676 } elsif (!defined $hang) {
1d2654e1 677 @hangulCE = $der->($u);
539ce3d8 678 } else {
1d2654e1
JH
679 my $max = $self->{maxlength};
680 my @decH = _decompHangul($u);
681
682 if (@decH == 2) {
683 my $contract = join(CODE_SEP, @decH);
91ae00cb 684 @decH = ($contract) if $map->{$contract};
1d2654e1
JH
685 } else { # must be <@decH == 3>
686 if ($max->{$decH[0]}) {
687 my $contract = join(CODE_SEP, @decH);
91ae00cb 688 if ($map->{$contract}) {
1d2654e1
JH
689 @decH = ($contract);
690 } else {
691 $contract = join(CODE_SEP, @decH[0,1]);
91ae00cb 692 $map->{$contract} and @decH = ($contract, $decH[2]);
1d2654e1
JH
693 }
694 # even if V's ignorable, LT contraction is not supported.
695 # If such a situatution were required, NFD should be used.
696 }
697 if (@decH == 3 && $max->{$decH[1]}) {
698 my $contract = join(CODE_SEP, @decH[1,2]);
91ae00cb 699 $map->{$contract} and @decH = ($decH[0], $contract);
1d2654e1
JH
700 }
701 }
702
703 @hangulCE = map({
91ae00cb 704 $map->{$_} ? @{ $map->{$_} } : $der->($_);
1d2654e1
JH
705 } @decH);
706 }
abd1ec54 707 return map _varCE($vbl, $_), @hangulCE;
539ce3d8 708 } else {
584e761d
CBW
709 my $cjk = $self->{overrideCJK};
710 my $vers = $self->{UCA_Version};
711 if ($cjk && _isUIdeo($u, $vers)) {
712 my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
713 return map _varCE($vbl, $_), @cjkCE;
714 }
715 if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
716 return map _varCE($vbl, $_), _uideoCE_8($u);
717 }
abd1ec54 718 return map _varCE($vbl, $_), $der->($u);
a7fbee98 719 }
d16e9e3d
JH
720}
721
d16e9e3d
JH
722
723##
724## string sortkey = getSortKey(string arg)
725##
726sub getSortKey
727{
a7fbee98
JH
728 my $self = shift;
729 my $lev = $self->{level};
91ae00cb 730 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
00e00351 731 my $vers = $self->{UCA_Version};
aa7758f7
CBW
732 my $vbl = $self->{variable};
733 my $term = $self->{hangul_terminator};
734 my $v2i = $vers >= 9 && $vbl ne 'non-ignorable';
91ae00cb 735
abd1ec54 736 my @buf; # weight arrays
aa7758f7 737 if ($term) {
91ae00cb 738 my $preHST = '';
aa7758f7 739 my $termCE = _varCE($vbl, pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
91ae00cb
NC
740 foreach my $jcps (@$rEnt) {
741 # weird things like VL, TL-contraction are not considered!
aa7758f7 742 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
91ae00cb
NC
743 if ($preHST && !$curHST || # hangul before non-hangul
744 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
745 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
746 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
aa7758f7 747 push @buf, $termCE;
91ae00cb
NC
748 }
749 $preHST = $curHST;
abd1ec54 750 push @buf, $self->getWt($jcps);
91ae00cb 751 }
aa7758f7
CBW
752 push @buf, $termCE if $preHST; # end at hangul
753 } else {
91ae00cb 754 foreach my $jcps (@$rEnt) {
abd1ec54 755 push @buf, $self->getWt($jcps);
91ae00cb
NC
756 }
757 }
758
abd1ec54
NC
759 # make sort key
760 my @ret = ([],[],[],[]);
761 my $last_is_variable;
762
763 foreach my $vwt (@buf) {
764 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
3756e7ca
RGS
765
766 # "Ignorable (L1, L2) after Variable" since track. v. 9
abd1ec54
NC
767 if ($v2i) {
768 if ($var) {
769 $last_is_variable = TRUE;
aa7758f7 770 } elsif (!$wt[0]) { # ignorable
4d36a948 771 next if $last_is_variable;
aa7758f7 772 } else {
abd1ec54 773 $last_is_variable = FALSE;
0116f5dc
JH
774 }
775 }
abd1ec54
NC
776 foreach my $v (0..$lev-1) {
777 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
a7fbee98
JH
778 }
779 }
45394607 780
a7fbee98
JH
781 # modification of tertiary weights
782 if ($self->{upper_before_lower}) {
6d24ed10
SP
783 foreach my $w (@{ $ret[2] }) {
784 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
785 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
786 elsif ($w == 0x1C) { $w += 1 } # square upper
787 elsif ($w == 0x1D) { $w -= 1 } # square lower
a7fbee98 788 }
45394607 789 }
a7fbee98 790 if ($self->{katakana_before_hiragana}) {
6d24ed10
SP
791 foreach my $w (@{ $ret[2] }) {
792 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
793 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
a7fbee98 794 }
45394607 795 }
9f1f04a1
RGS
796
797 if ($self->{backwardsFlag}) {
798 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
799 if ($self->{backwardsFlag} & (1 << $v)) {
800 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
801 }
802 }
803 }
804
4d36a948 805 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
45394607
JH
806}
807
808
809##
d16e9e3d 810## int compare = cmp(string a, string b)
45394607 811##
5398038e 812sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
813sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
814sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
815sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
816sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
817sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
818sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
45394607
JH
819
820##
d16e9e3d 821## list[strings] sorted = sort(list[strings] arg)
45394607 822##
a7fbee98
JH
823sub sort {
824 my $obj = shift;
825 return
826 map { $_->[1] }
827 sort{ $a->[0] cmp $b->[0] }
828 map [ $obj->getSortKey($_), $_ ], @_;
45394607
JH
829}
830
0116f5dc 831
b5d9a953
CBW
832sub _derivCE_22 {
833 my $u = shift;
834 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
835 ? 0xFB40 : # CJK
836 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
837 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
838 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
839 CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
840 ? 0xFB80 # CJK ext.
841 : 0xFBC0; # others
842 my $aaaa = $base + ($u >> 15);
843 my $bbbb = ($u & 0x7FFF) | 0x8000;
844 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
845 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
846}
847
00e00351
CBW
848sub _derivCE_20 {
849 my $u = shift;
584e761d
CBW
850 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
851 ? 0xFB40 : # CJK
00e00351
CBW
852 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
853 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
584e761d
CBW
854 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
855 ? 0xFB80 # CJK ext.
856 : 0xFBC0; # others
00e00351
CBW
857 my $aaaa = $base + ($u >> 15);
858 my $bbbb = ($u & 0x7FFF) | 0x8000;
859 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
860 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
861}
862
74b94a79 863sub _derivCE_18 {
3756e7ca 864 my $u = shift;
584e761d
CBW
865 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u})
866 ? 0xFB40 : # CJK
74b94a79 867 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
584e761d
CBW
868 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
869 ? 0xFB80 # CJK ext.
870 : 0xFBC0; # others
74b94a79
CBW
871 my $aaaa = $base + ($u >> 15);
872 my $bbbb = ($u & 0x7FFF) | 0x8000;
873 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
874 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
875}
3756e7ca 876
74b94a79
CBW
877sub _derivCE_14 {
878 my $u = shift;
584e761d
CBW
879 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u})
880 ? 0xFB40 : # CJK
74b94a79 881 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
584e761d
CBW
882 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
883 ? 0xFB80 # CJK ext.
884 : 0xFBC0; # others
3756e7ca
RGS
885 my $aaaa = $base + ($u >> 15);
886 my $bbbb = ($u & 0x7FFF) | 0x8000;
74b94a79
CBW
887 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
888 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
3756e7ca
RGS
889}
890
4d36a948 891sub _derivCE_9 {
0116f5dc 892 my $u = shift;
584e761d
CBW
893 my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u})
894 ? 0xFB40 : # CJK
74b94a79 895 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
584e761d
CBW
896 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
897 ? 0xFB80 # CJK ext.
898 : 0xFBC0; # others
0116f5dc
JH
899 my $aaaa = $base + ($u >> 15);
900 my $bbbb = ($u & 0x7FFF) | 0x8000;
74b94a79
CBW
901 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
902 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
0116f5dc
JH
903}
904
4d36a948 905sub _derivCE_8 {
0116f5dc
JH
906 my $code = shift;
907 my $aaaa = 0xFF80 + ($code >> 15);
908 my $bbbb = ($code & 0x7FFF) | 0x8000;
584e761d
CBW
909 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
910 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
45394607
JH
911}
912
3756e7ca
RGS
913sub _uideoCE_8 {
914 my $u = shift;
915 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
916}
917
918sub _isUIdeo {
584e761d 919 # $uca_vers = 0 for _uideoCE_8()
3756e7ca 920 my ($u, $uca_vers) = @_;
74b94a79 921 return((CJK_UidIni <= $u && (
00e00351 922 $uca_vers >= 20 ? ($u <= CJK_UidF52) :
74b94a79
CBW
923 $uca_vers >= 18 ? ($u <= CJK_UidF51) :
924 $uca_vers >= 14 ? ($u <= CJK_UidF41) :
584e761d 925 ($u <= CJK_UidFin))) || $CompatUI{$u}
3756e7ca
RGS
926 ||
927 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
928 ||
584e761d 929 ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
00e00351 930 ||
539ce3d8 931 ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
b5d9a953
CBW
932 ||
933 ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
3756e7ca
RGS
934 );
935}
936
91ae00cb 937
45394607
JH
938##
939## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
940##
a7fbee98 941sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
5398038e 942
a7fbee98 943#
4d36a948 944# $code *must* be in Hangul syllable.
a7fbee98
JH
945# Check it before you enter here.
946#
5398038e 947sub _decompHangul {
948 my $code = shift;
3756e7ca
RGS
949 my $si = $code - Hangul_SBase;
950 my $li = int( $si / Hangul_NCount);
951 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
952 my $ti = $si % Hangul_TCount;
5398038e 953 return (
3756e7ca
RGS
954 Hangul_LBase + $li,
955 Hangul_VBase + $vi,
956 $ti ? (Hangul_TBase + $ti) : (),
5398038e 957 );
45394607
JH
958}
959
10d7ec48 960sub _isIllegal {
4d36a948 961 my $code = shift;
b5d9a953 962 return((! defined $code) # removed
4d36a948 963 || ($code < 0 || 0x10FFFF < $code) # out of range
b5d9a953
CBW
964 );
965}
966
967sub _isNonchar {
968 my $code = shift;
969 return((($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
4d36a948
ST
970 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
971 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
b5d9a953 972 );
4d36a948
ST
973}
974
91ae00cb
NC
975# Hangul Syllable Type
976sub getHST {
977 my $u = shift;
00e00351
CBW
978 my $vers = shift || 0;
979
980 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
981 return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
982 }
983
984 if ($vers < 20) {
985 return Hangul_LIni <= $u && $u <= Hangul_LFin ||
986 $u == Hangul_LFill ? "L" :
987 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
988 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : "";
989 } else {
990 return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
991 HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" :
992 Hangul_VIni <= $u && $u <= Hangul_VEnd ||
993 HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" :
994 Hangul_TIni <= $u && $u <= Hangul_TEnd ||
995 HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : "";
996 }
91ae00cb
NC
997}
998
4d36a948
ST
999
1000##
1001## bool _nonIgnorAtLevel(arrayref weights, int level)
1002##
1003sub _nonIgnorAtLevel($$)
1004{
1005 my $wt = shift;
1006 return if ! defined $wt;
1007 my $lv = shift;
9f1f04a1 1008 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
4d36a948
ST
1009}
1010
1011##
1012## bool _eqArray(
1013## arrayref of arrayref[weights] source,
1014## arrayref of arrayref[weights] substr,
1015## int level)
1016## * comparison of graphemes vs graphemes.
1017## @$source >= @$substr must be true (check it before call this);
1018##
1019sub _eqArray($$$)
1020{
1021 my $source = shift;
1022 my $substr = shift;
1023 my $lev = shift;
1024
1025 for my $g (0..@$substr-1){
1026 # Do the $g'th graphemes have the same number of AV weigths?
1027 return if @{ $source->[$g] } != @{ $substr->[$g] };
1028
1029 for my $w (0..@{ $substr->[$g] }-1) {
1030 for my $v (0..$lev-1) {
1031 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
1032 }
1033 }
1034 }
1035 return 1;
1036}
1037
1038##
1039## (int position, int length)
1040## int position = index(string, substring, position, [undoc'ed grobal])
1041##
1042## With "grobal" (only for the list context),
1043## returns list of arrayref[position, length].
1044##
1045sub index
1046{
91ae00cb
NC
1047 my $self = shift;
1048 my $str = shift;
1049 my $len = length($str);
1050 my $subE = $self->splitEnt(shift);
1051 my $pos = @_ ? shift : 0;
1052 $pos = 0 if $pos < 0;
1053 my $grob = shift;
1054
1055 my $lev = $self->{level};
3756e7ca
RGS
1056 my $v2i = $self->{UCA_Version} >= 9 &&
1057 $self->{variable} ne 'non-ignorable';
91ae00cb
NC
1058
1059 if (! @$subE) {
4d36a948
ST
1060 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
1061 return $grob
1062 ? map([$_, 0], $temp..$len)
1063 : wantarray ? ($temp,0) : $temp;
1064 }
abd1ec54
NC
1065 $len < $pos
1066 and return wantarray ? () : NOMATCHPOS;
91ae00cb 1067 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
abd1ec54
NC
1068 @$strE
1069 or return wantarray ? () : NOMATCHPOS;
1070
4d36a948
ST
1071 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
1072
abd1ec54
NC
1073 my $last_is_variable;
1074 for my $vwt (map $self->getWt($_), @$subE) {
1075 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1076 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 1077
3756e7ca
RGS
1078 # "Ignorable (L1, L2) after Variable" since track. v. 9
1079 if ($v2i) {
abd1ec54
NC
1080 if ($var) {
1081 $last_is_variable = TRUE;
1082 }
1083 elsif (!$wt[0]) { # ignorable
4d36a948 1084 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
1085 }
1086 else {
1087 $last_is_variable = FALSE;
4d36a948
ST
1088 }
1089 }
1090
abd1ec54
NC
1091 if (@subWt && !$var && !$wt[0]) {
1092 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1393fe00 1093 } elsif ($to_be_pushed) {
abd1ec54 1094 push @subWt, [ \@wt ];
4d36a948 1095 }
1393fe00 1096 # else ===> skipped
4d36a948
ST
1097 }
1098
1099 my $count = 0;
91ae00cb 1100 my $end = @$strE - 1;
4d36a948 1101
abd1ec54 1102 $last_is_variable = FALSE; # reuse
4d36a948
ST
1103 for (my $i = 0; $i <= $end; ) { # no $i++
1104 my $found_base = 0;
1105
1106 # fetch a grapheme
1107 while ($i <= $end && $found_base == 0) {
abd1ec54
NC
1108 for my $vwt ($self->getWt($strE->[$i][0])) {
1109 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1110 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
4d36a948 1111
3756e7ca
RGS
1112 # "Ignorable (L1, L2) after Variable" since track. v. 9
1113 if ($v2i) {
abd1ec54
NC
1114 if ($var) {
1115 $last_is_variable = TRUE;
1116 }
1117 elsif (!$wt[0]) { # ignorable
4d36a948 1118 $to_be_pushed = FALSE if $last_is_variable;
abd1ec54
NC
1119 }
1120 else {
1121 $last_is_variable = FALSE;
4d36a948
ST
1122 }
1123 }
1124
abd1ec54
NC
1125 if (@strWt && !$var && !$wt[0]) {
1126 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
91ae00cb 1127 $finPos[-1] = $strE->[$i][2];
4d36a948 1128 } elsif ($to_be_pushed) {
abd1ec54 1129 push @strWt, [ \@wt ];
91ae00cb 1130 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
4d36a948 1131 $finPos[-1] = NOMATCHPOS if $found_base;
91ae00cb 1132 push @finPos, $strE->[$i][2];
4d36a948
ST
1133 $found_base++;
1134 }
1135 # else ===> no-op
1136 }
1137 $i++;
1138 }
1139
1140 # try to match
1141 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1142 if ($iniPos[0] != NOMATCHPOS &&
1143 $finPos[$#subWt] != NOMATCHPOS &&
1144 _eqArray(\@strWt, \@subWt, $lev)) {
1145 my $temp = $iniPos[0] + $pos;
1146
1147 if ($grob) {
1148 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1149 splice @strWt, 0, $#subWt;
1150 splice @iniPos, 0, $#subWt;
1151 splice @finPos, 0, $#subWt;
1152 }
1153 else {
1154 return wantarray
1155 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1156 : $temp;
1157 }
1158 }
1159 shift @strWt;
1160 shift @iniPos;
1161 shift @finPos;
1162 }
1163 }
1164
1165 return $grob
1166 ? @g_ret
1167 : wantarray ? () : NOMATCHPOS;
1168}
1169
1170##
1171## scalarref to matching part = match(string, substring)
1172##
1173sub match
1174{
1175 my $self = shift;
1176 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1177 my $temp = substr($_[0], $pos, $len);
1178 return wantarray ? $temp : \$temp;
1179 # An lvalue ref \substr should be avoided,
1180 # since its value is affected by modification of its referent.
1181 }
1182 else {
1183 return;
1184 }
1185}
1186
1187##
1188## arrayref matching parts = gmatch(string, substring)
1189##
1190sub gmatch
1191{
1192 my $self = shift;
1193 my $str = shift;
1194 my $sub = shift;
1195 return map substr($str, $_->[0], $_->[1]),
1196 $self->index($str, $sub, 0, 'g');
1197}
1198
1199##
1200## bool subst'ed = subst(string, substring, replace)
1201##
1202sub subst
1203{
1204 my $self = shift;
1205 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1206
1207 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1208 if ($code) {
1209 my $mat = substr($_[0], $pos, $len);
1210 substr($_[0], $pos, $len, $code->($mat));
1211 } else {
1212 substr($_[0], $pos, $len, $_[2]);
1213 }
1214 return TRUE;
1215 }
1216 else {
1217 return FALSE;
1218 }
1219}
1220
1221##
1222## int count = gsubst(string, substring, replace)
1223##
1224sub gsubst
1225{
1226 my $self = shift;
1227 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1228 my $cnt = 0;
1229
1230 # Replacement is carried out from the end, then use reverse.
1231 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1232 if ($code) {
1233 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1234 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1235 } else {
1236 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1237 }
1238 $cnt++;
1239 }
1240 return $cnt;
1241}
1242
45394607
JH
12431;
1244__END__
1245
1246=head1 NAME
1247
a7fbee98 1248Unicode::Collate - Unicode Collation Algorithm
45394607
JH
1249
1250=head1 SYNOPSIS
1251
1252 use Unicode::Collate;
1253
1254 #construct
5398038e 1255 $Collator = Unicode::Collate->new(%tailoring);
45394607
JH
1256
1257 #sort
5398038e 1258 @sorted = $Collator->sort(@not_sorted);
45394607
JH
1259
1260 #compare
a7fbee98 1261 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
45394607 1262
539ce3d8
CBW
1263B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1264according to Perl's Unicode support. See L<perlunicode>,
1265L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1266Otherwise you can use C<preprocess> or should decode them before.
91ae00cb 1267
45394607
JH
1268=head1 DESCRIPTION
1269
3756e7ca
RGS
1270This module is an implementation of Unicode Technical Standard #10
1271(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
4d36a948 1272
45394607
JH
1273=head2 Constructor and Tailoring
1274
539ce3d8
CBW
1275The C<new> method returns a collator object. If new() is called
1276with no parameters, the collator should do the default collation.
d16e9e3d 1277
5398038e 1278 $Collator = Unicode::Collate->new(
0116f5dc 1279 UCA_Version => $UCA_Version,
aa7758f7 1280 alternate => $alternate, # alias for 'variable'
45394607
JH
1281 backwards => $levelNumber, # or \@levelNumbers
1282 entry => $element,
91ae00cb 1283 hangul_terminator => $term_primary_weight,
45394607
JH
1284 ignoreName => qr/$ignoreName/,
1285 ignoreChar => qr/$ignoreChar/,
1286 katakana_before_hiragana => $bool,
1287 level => $collationLevel,
91ae00cb 1288 normalization => $normalization_form,
45394607
JH
1289 overrideCJK => \&overrideCJK,
1290 overrideHangul => \&overrideHangul,
1291 preprocess => \&preprocess,
1292 rearrange => \@charList,
aa7758f7 1293 suppress => \@charList,
45394607
JH
1294 table => $filename,
1295 undefName => qr/$undefName/,
1296 undefChar => qr/$undefChar/,
1297 upper_before_lower => $bool,
91ae00cb 1298 variable => $variable,
45394607 1299 );
45394607
JH
1300
1301=over 4
1302
0116f5dc
JH
1303=item UCA_Version
1304
3756e7ca
RGS
1305If the tracking version number of UCA is given,
1306behavior of that tracking version is emulated on collating.
0116f5dc 1307If omitted, the return value of C<UCA_Version()> is used.
3756e7ca 1308
b5d9a953 1309The following tracking versions are supported. The default is 20.
3756e7ca 1310
0d50d293 1311 UCA Unicode Standard DUCET (@version)
b5d9a953 1312 -------------------------------------------------------
0d50d293
RGS
1313 8 3.1 3.0.1 (3.0.1d9)
1314 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1315 11 4.0 4.0.0 (4.0.0)
1316 14 4.1.0 4.1.0 (4.1.0)
74b94a79
CBW
1317 16 5.0 5.0.0 (5.0.0)
1318 18 5.1.0 5.1.0 (5.1.0)
00e00351 1319 20 5.2.0 5.2.0 (5.2.0)
b5d9a953 1320 22 6.0.0 6.0.0 (6.0.0)
3756e7ca
RGS
1321
1322Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1323
b5d9a953
CBW
1324* Noncharacters (e.g. U+FFFF) are not ignored, and can be overrided
1325since C<UCA_Version> 22.
1326
1327* Fully ignorable characters were ignored, and would not interrupt
1328contractions with C<UCA_Version> 9 and 11.
1329
1330* Treatment of ignorables after variables and some behaviors
1331were changed at C<UCA_Version> 9.
1332
1333* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1334depend on C<UCA_Version>.
1335
1336* Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1337C<hangul_terminator>.
1338
3756e7ca 1339=item alternate
0116f5dc 1340
3756e7ca 1341-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
0116f5dc 1342
3756e7ca
RGS
1343For backward compatibility, C<alternate> (old name) can be used
1344as an alias for C<variable>.
0116f5dc 1345
45394607
JH
1346=item backwards
1347
4d36a948 1348-- see 3.1.2 French Accents, UTS #10.
45394607
JH
1349
1350 backwards => $levelNumber or \@levelNumbers
1351
1352Weights in reverse order; ex. level 2 (diacritic ordering) in French.
68adb2b0
CBW
1353If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1354forwards at all the levels.
45394607
JH
1355
1356=item entry
1357
4d36a948 1358-- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
45394607 1359
91ae00cb
NC
1360If the same character (or a sequence of characters) exists
1361in the collation element table through C<table>,
1362mapping to collation elements is overrided.
1363If it does not exist, the mapping is defined additionally.
45394607 1364
abd1ec54
NC
1365 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
13660063 0068 ; [.0E6A.0020.0002.0063] # ch
13670043 0068 ; [.0E6A.0020.0007.0043] # Ch
13680043 0048 ; [.0E6A.0020.0008.0043] # CH
1369006C 006C ; [.0F4C.0020.0002.006C] # ll
1370004C 006C ; [.0F4C.0020.0007.004C] # Ll
1371004C 004C ; [.0F4C.0020.0008.004C] # LL
e7f779c8
RGS
137200F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1373006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
137400D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1375004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
abd1ec54
NC
1376ENTRY
1377
1378 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
137900E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
138000C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1381ENTRY
45394607 1382
4d36a948 1383B<NOTE:> The code point in the UCA file format (before C<';'>)
abd1ec54
NC
1384B<must> be a Unicode code point (defined as hexadecimal),
1385but not a native code point.
4d36a948
ST
1386So C<0063> must always denote C<U+0063>,
1387but not a character of C<"\x63">.
1388
abd1ec54
NC
1389Weighting may vary depending on collation element table.
1390So ensure the weights defined in C<entry> will be consistent with
1391those in the collation element table loaded via C<table>.
1392
1393In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1394and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1395(as a value between C<0E60> and C<0E6D>)
1396makes ordering as C<C E<lt> CH E<lt> D>.
1397Exactly speaking DUCET already has some characters between C<C> and C<D>:
1398C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1399C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1400and C<c-curl> (C<U+0255>) with C<0E69>.
1401Then primary weight C<0E6A> for C<CH> makes C<CH>
1402ordered between C<c-curl> and C<D>.
1403
91ae00cb
NC
1404=item hangul_terminator
1405
3756e7ca 1406-- see 7.1.4 Trailing Weights, UTS #10.
91ae00cb
NC
1407
1408If a true value is given (non-zero but should be positive),
1409it will be added as a terminator primary weight to the end of
1410every standard Hangul syllable. Secondary and any higher weights
1411for terminator are set to zero.
1412If the value is false or C<hangul_terminator> key does not exist,
1413insertion of terminator weights will not be performed.
1414
1415Boundaries of Hangul syllables are determined
1416according to conjoining Jamo behavior in F<the Unicode Standard>
1417and F<HangulSyllableType.txt>.
1418
1419B<Implementation Note:>
1420(1) For expansion mapping (Unicode character mapped
1421to a sequence of collation elements), a terminator will not be added
1422between collation elements, even if Hangul syllable boundary exists there.
1423Addition of terminator is restricted to the next position
1424to the last collation element.
1425
1426(2) Non-conjoining Hangul letters
1427(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1428automatically terminated with a terminator primary weight.
1429These characters may need terminator included in a collation element
1430table beforehand.
1431
45394607
JH
1432=item ignoreChar
1433
3756e7ca
RGS
1434=item ignoreName
1435
1436-- see 3.2.2 Variable Weighting, UTS #10.
45394607 1437
caffd4cf
ST
1438Makes the entry in the table completely ignorable;
1439i.e. as if the weights were zero at all level.
45394607 1440
3756e7ca
RGS
1441Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1442will be ignored. Through C<ignoreName>, any character whose name
1443(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1444will be ignored.
1445
a7fbee98 1446E.g. when 'a' and 'e' are ignorable,
45394607
JH
1447'element' is equal to 'lament' (or 'lmnt').
1448
3756e7ca
RGS
1449=item katakana_before_hiragana
1450
1451-- see 7.3.1 Tertiary Weight Table, UTS #10.
1452
1453By default, hiragana is before katakana.
1454If the parameter is made true, this is reversed.
1455
1456B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1457distinctions must occur in level 3, and their weights at level 3 must be
1458same as those mentioned in 7.3.1, UTS #10.
1459If you define your collation elements which violate this requirement,
1460this parameter does not work validly.
1461
45394607
JH
1462=item level
1463
3756e7ca 1464-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1465
1466Set the maximum level.
1467Any higher levels than the specified one are ignored.
1468
1469 Level 1: alphabetic ordering
1470 Level 2: diacritic ordering
1471 Level 3: case ordering
91ae00cb 1472 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
45394607
JH
1473
1474 ex.level => 2,
1475
a7fbee98
JH
1476If omitted, the maximum is the 4th.
1477
45394607
JH
1478=item normalization
1479
3756e7ca 1480-- see 4.1 Normalize, UTS #10.
45394607 1481
905aa9f0 1482If specified, strings are normalized before preparation of sort keys
45394607
JH
1483(the normalization is executed after preprocess).
1484
1d2654e1
JH
1485A form name C<Unicode::Normalize::normalize()> accepts will be applied
1486as C<$normalization_form>.
06c8fc8f 1487Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1d2654e1
JH
1488See C<Unicode::Normalize::normalize()> for detail.
1489If omitted, C<'NFD'> is used.
45394607 1490
91ae00cb 1491C<normalization> is performed after C<preprocess> (if defined).
45394607 1492
06c8fc8f
RGS
1493Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1494though they are not concerned with C<Unicode::Normalize::normalize()>.
1495
1496If C<undef> (not a string C<"undef">) is passed explicitly
1497as the value for this key,
45394607 1498any normalization is not carried out (this may make tailoring easier
abd1ec54
NC
1499if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1500only contiguous contractions are resolved;
1501e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1502C<A-cedilla-ring> would be primary equal to C<A>.
06c8fc8f
RGS
1503In this point,
1504C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1505B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1506
1507In the case of C<(normalization =E<gt> "prenormalized")>,
1508any normalization is not performed, but
b5d9a953 1509discontiguous contractions with combining characters are performed.
06c8fc8f
RGS
1510Therefore
1511C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1512B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1513If source strings are finely prenormalized,
1514C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1515
1516Except C<(normalization =E<gt> undef)>,
1517B<Unicode::Normalize> is required (see also B<CAVEAT>).
45394607
JH
1518
1519=item overrideCJK
1520
4d36a948 1521-- see 7.1 Derived Collation Elements, UTS #10.
45394607 1522
584e761d
CBW
1523By default, CJK unified ideographs are ordered in Unicode codepoint
1524order, but those in the CJK Unified Ideographs block are lesser than
1525those in the CJK Unified Ideographs Extension A etc.
00e00351 1526
b5d9a953
CBW
1527 In the CJK Unified Ideographs block:
1528 U+4E00..U+9FA5 if UCA_Version is 8 to 11.
1529 U+4E00..U+9FBB if UCA_Version is 14 to 16.
1530 U+4E00..U+9FC3 if UCA_Version is 18.
1531 U+4E00..U+9FCB if UCA_Version is 20 or greater.
00e00351 1532
b5d9a953
CBW
1533 In the CJK Unified Ideographs Extension blocks:
1534 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1535 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater.
1536 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
91ae00cb 1537
584e761d
CBW
1538Through C<overrideCJK>, ordering of CJK unified ideographs (including
1539extensions) can be overrided.
45394607 1540
584e761d 1541ex. CJK unified ideographs in the JIS code point order.
45394607
JH
1542
1543 overrideCJK => sub {
a7fbee98
JH
1544 my $u = shift; # get a Unicode codepoint
1545 my $b = pack('n', $u); # to UTF-16BE
1546 my $s = your_unicode_to_sjis_converter($b); # convert
1547 my $n = unpack('n', $s); # convert sjis to short
1548 [ $n, 0x20, 0x2, $u ]; # return the collation element
45394607
JH
1549 },
1550
539ce3d8
CBW
1551The return value may be an arrayref of 1st to 4th weights as shown
1552above. The return value may be an integer as the primary weight
1553as shown below. If C<undef> is returned, the default derived
1554collation element will be used.
1555
1556 overrideCJK => sub {
1557 my $u = shift; # get a Unicode codepoint
1558 my $b = pack('n', $u); # to UTF-16BE
1559 my $s = your_unicode_to_sjis_converter($b); # convert
1560 my $n = unpack('n', $s); # convert sjis to short
1561 return $n; # return the primary weight
1562 },
1563
1564The return value may be a list containing zero or more of
1565an arrayref, an integer, or C<undef>.
1566
584e761d 1567ex. ignores all CJK unified ideographs.
a7fbee98
JH
1568
1569 overrideCJK => sub {()}, # CODEREF returning empty list
1570
1571 # where ->eq("Pe\x{4E00}rl", "Perl") is true
584e761d 1572 # as U+4E00 is a CJK unified ideograph and to be ignorable.
a7fbee98
JH
1573
1574If C<undef> is passed explicitly as the value for this key,
584e761d
CBW
1575weights for CJK unified ideographs are treated as undefined.
1576But assignment of weight for CJK unified ideographs
1577in C<table> or C<entry> is still valid.
1578
1579B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1580C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1581C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1582ideographs. But they can't be overrided via C<overrideCJK> when you use
1583DUCET, as the table includes weights for them. C<table> or C<entry> has
1584priority over C<overrideCJK>.
a7fbee98
JH
1585
1586=item overrideHangul
1587
4d36a948 1588-- see 7.1 Derived Collation Elements, UTS #10.
a7fbee98 1589
584e761d 1590By default, Hangul syllables are decomposed into Hangul Jamo,
abd1ec54 1591even if C<(normalization =E<gt> undef)>.
584e761d 1592But the mapping of Hangul syllables may be overrided.
a7fbee98 1593
3756e7ca 1594This parameter works like C<overrideCJK>, so see there for examples.
a7fbee98 1595
584e761d
CBW
1596If you want to override the mapping of Hangul syllables,
1597NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1598Hangul syllables before overriding. FCD may decompose Hangul syllables
1599as the case may be.
45394607 1600
a7fbee98 1601If C<undef> is passed explicitly as the value for this key,
584e761d 1602weight for Hangul syllables is treated as undefined
a7fbee98 1603without decomposition into Hangul Jamo.
584e761d
CBW
1604But definition of weight for Hangul syllables
1605in C<table> or C<entry> is still valid.
a7fbee98 1606
45394607
JH
1607=item preprocess
1608
4d36a948 1609-- see 5.1 Preprocessing, UTS #10.
45394607
JH
1610
1611If specified, the coderef is used to preprocess
1612before the formation of sort keys.
1613
a7fbee98 1614ex. dropping English articles, such as "a" or "the".
45394607
JH
1615Then, "the pen" is before "a pencil".
1616
1617 preprocess => sub {
1618 my $str = shift;
a7fbee98 1619 $str =~ s/\b(?:an?|the)\s+//gi;
1d2654e1 1620 return $str;
45394607
JH
1621 },
1622
91ae00cb 1623C<preprocess> is performed before C<normalization> (if defined).
1d2654e1 1624
539ce3d8
CBW
1625ex. decoding strings in a legacy encoding such as shift-jis:
1626
1627 $sjis_collator = Unicode::Collate->new(
1628 preprocess => \&your_shiftjis_to_unicode_decoder,
1629 );
1630 @result = $sjis_collator->sort(@shiftjis_strings);
1631
1632B<Note:> Strings returned from the coderef will be interpreted
1633according to Perl's Unicode support. See L<perlunicode>,
1634L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1635
45394607
JH
1636=item rearrange
1637
4d36a948 1638-- see 3.1.3 Rearrangement, UTS #10.
45394607
JH
1639
1640Characters that are not coded in logical order and to be rearranged.
3756e7ca 1641If C<UCA_Version> is equal to or lesser than 11, default is:
45394607
JH
1642
1643 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1644
3756e7ca
RGS
1645If you want to disallow any rearrangement, pass C<undef> or C<[]>
1646(a reference to empty list) as the value for this key.
1647
74b94a79
CBW
1648If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1649(i.e. no rearrangement).
a7fbee98 1650
0116f5dc
JH
1651B<According to the version 9 of UCA, this parameter shall not be used;
1652but it is not warned at present.>
1653
aa7758f7
CBW
1654=item suppress
1655
1656-- see suppress contractions in 5.14.11 Special-Purpose Commands,
1657UTS #35 (LDML).
1658
539ce3d8 1659Contractions beginning with the specified characters are suppressed,
584e761d 1660even if those contractions are defined in C<table> or C<entry>.
aa7758f7
CBW
1661
1662An example for Russian and some languages using the Cyrillic script:
1663
1664 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1665
1666where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1667
45394607
JH
1668=item table
1669
4d36a948 1670-- see 3.2 Default Unicode Collation Element Table, UTS #10.
45394607 1671
91ae00cb 1672You can use another collation element table if desired.
45394607 1673
e7f779c8 1674The table file should locate in the F<Unicode/Collate> directory
3756e7ca
RGS
1675on C<@INC>. Say, if the filename is F<Foo.txt>,
1676the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
e7f779c8
RGS
1677
1678By default, F<allkeys.txt> (as the filename of DUCET) is used.
0d50d293
RGS
1679If you will prepare your own table file, any name other than F<allkeys.txt>
1680may be better to avoid namespace conflict.
45394607 1681
00e00351
CBW
1682B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1683module, and it may save time at the run time.
1684Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1685or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
1393fe00 1686will prevent this module from using the compiled DUCET.
00e00351 1687
a7fbee98 1688If C<undef> is passed explicitly as the value for this key,
91ae00cb 1689no file is read (but you can define collation elements via C<entry>).
a7fbee98
JH
1690
1691A typical way to define a collation element table
1692without any file of table:
1693
1694 $onlyABC = Unicode::Collate->new(
1695 table => undef,
1696 entry => << 'ENTRIES',
16970061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
16980041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
16990062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17000042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17010063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17020043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1703ENTRIES
1704 );
905aa9f0 1705
3756e7ca
RGS
1706If C<ignoreName> or C<undefName> is used, character names should be
1707specified as a comment (following C<#>) on each line.
45394607
JH
1708
1709=item undefChar
1710
3756e7ca
RGS
1711=item undefName
1712
4d36a948 1713-- see 6.3.4 Reducing the Repertoire, UTS #10.
45394607 1714
584e761d 1715Undefines the collation element as if it were unassigned in the C<table>.
45394607
JH
1716This reduces the size of the table.
1717If an unassigned character appears in the string to be collated,
1718the sort key is made from its codepoint
1719as a single-character collation element,
1720as it is greater than any other assigned collation elements
1721(in the codepoint order among the unassigned characters).
1722But, it'd be better to ignore characters
1723unfamiliar to you and maybe never used.
1724
3756e7ca
RGS
1725Through C<undefChar>, any character matching C<qr/$undefChar/>
1726will be undefined. Through C<undefName>, any character whose name
1727(given in the C<table> file as a comment) matches C<qr/$undefName/>
1728will be undefined.
1729
e7f779c8
RGS
1730ex. Collation weights for beyond-BMP characters are not stored in object:
1731
1732 undefChar => qr/[^\0-\x{fffd}]/,
1733
45394607
JH
1734=item upper_before_lower
1735
3756e7ca 1736-- see 6.6 Case Comparisons, UTS #10.
45394607 1737
3756e7ca
RGS
1738By default, lowercase is before uppercase.
1739If the parameter is made true, this is reversed.
45394607 1740
3756e7ca
RGS
1741B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1742distinctions must occur in level 3, and their weights at level 3 must be
1743same as those mentioned in 7.3.1, UTS #10.
1744If you define your collation elements which differs from this requirement,
1745this parameter doesn't work validly.
45394607 1746
91ae00cb
NC
1747=item variable
1748
91ae00cb
NC
1749-- see 3.2.2 Variable Weighting, UTS #10.
1750
91ae00cb
NC
1751This key allows to variable weighting for variable collation elements,
1752which are marked with an ASTERISK in the table
1753(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1754
1755 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1756
1757These names are case-insensitive.
1758By default (if specification is omitted), 'shifted' is adopted.
1759
1760 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1761 considered at the 4th level.
1762
abd1ec54 1763 'Non-Ignorable' Variable elements are not reset to ignorable.
91ae00cb
NC
1764
1765 'Shifted' Variable elements are made ignorable at levels 1 through 3
1766 their level 4 weight is replaced by the old level 1 weight.
1767 Level 4 weight for Non-Variable elements is 0xFFFF.
1768
1769 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1770 are trimmed.
1771
45394607
JH
1772=back
1773
3164dd77 1774=head2 Methods for Collation
45394607
JH
1775
1776=over 4
1777
5398038e 1778=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
45394607
JH
1779
1780Sorts a list of strings.
1781
5398038e 1782=item C<$result = $Collator-E<gt>cmp($a, $b)>
45394607
JH
1783
1784Returns 1 (when C<$a> is greater than C<$b>)
1785or 0 (when C<$a> is equal to C<$b>)
1786or -1 (when C<$a> is lesser than C<$b>).
1787
5398038e 1788=item C<$result = $Collator-E<gt>eq($a, $b)>
1789
1790=item C<$result = $Collator-E<gt>ne($a, $b)>
1791
1792=item C<$result = $Collator-E<gt>lt($a, $b)>
1793
1794=item C<$result = $Collator-E<gt>le($a, $b)>
1795
1796=item C<$result = $Collator-E<gt>gt($a, $b)>
1797
1798=item C<$result = $Collator-E<gt>ge($a, $b)>
1799
a7fbee98 1800They works like the same name operators as theirs.
5398038e 1801
1802 eq : whether $a is equal to $b.
1803 ne : whether $a is not equal to $b.
1804 lt : whether $a is lesser than $b.
1805 le : whether $a is lesser than $b or equal to $b.
1806 gt : whether $a is greater than $b.
1807 ge : whether $a is greater than $b or equal to $b.
1808
1809=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
45394607 1810
3756e7ca 1811-- see 4.3 Form Sort Key, UTS #10.
45394607
JH
1812
1813Returns a sort key.
1814
1815You compare the sort keys using a binary comparison
1816and get the result of the comparison of the strings using UCA.
1817
5398038e 1818 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
45394607
JH
1819
1820 is equivalent to
1821
5398038e 1822 $Collator->cmp($a, $b)
45394607 1823
a7fbee98
JH
1824=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1825
3756e7ca
RGS
1826Converts a sorting key into its representation form.
1827If C<UCA_Version> is 8, the output is slightly different.
1828
a7fbee98
JH
1829 use Unicode::Collate;
1830 my $c = Unicode::Collate->new();
1831 print $c->viewSortKey("Perl"),"\n";
1832
0116f5dc
JH
1833 # output:
1834 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1835 # Level 1 Level 2 Level 3 Level 4
1836
4d36a948
ST
1837=back
1838
1839=head2 Methods for Searching
d16e9e3d 1840
3756e7ca 1841B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
4d36a948
ST
1842for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1843C<subst>, C<gsubst>) is croaked,
1844as the position and the length might differ
1845from those on the specified string.
3756e7ca 1846(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
d16e9e3d 1847
4d36a948
ST
1848The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1849like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1850but they are not aware of any pattern, but only a literal substring.
1851
1852=over 4
1853
1854=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1855
1856=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
d16e9e3d
JH
1857
1858If C<$substring> matches a part of C<$string>, returns
1859the position of the first occurrence of the matching part in scalar context;
1860in list context, returns a two-element list of
1861the position and the length of the matching part.
1862
d16e9e3d
JH
1863If C<$substring> does not match any part of C<$string>,
1864returns C<-1> in scalar context and
1865an empty list in list context.
1866
1867e.g. you say
1868
5398038e 1869 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
4d36a948
ST
1870 # (normalization => undef) is REQUIRED.
1871 my $str = "Ich muß studieren Perl.";
1872 my $sub = "MÜSS";
d16e9e3d 1873 my $match;
a7fbee98 1874 if (my($pos,$len) = $Collator->index($str, $sub)) {
5398038e 1875 $match = substr($str, $pos, $len);
d16e9e3d
JH
1876 }
1877
4d36a948 1878and get C<"muß"> in C<$match> since C<"muß">
3756e7ca 1879is primary equal to C<"MÜSS">.
4d36a948
ST
1880
1881=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1882
1883=item C<($match) = $Collator-E<gt>match($string, $substring)>
1884
1885If C<$substring> matches a part of C<$string>, in scalar context, returns
1886B<a reference to> the first occurrence of the matching part
1887(C<$match_ref> is always true if matches,
1888since every reference is B<true>);
1889in list context, returns the first occurrence of the matching part.
1890
1891If C<$substring> does not match any part of C<$string>,
1892returns C<undef> in scalar context and
1893an empty list in list context.
1894
1895e.g.
1896
1897 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1898 print "matches [$$match_ref].\n";
1899 } else {
1900 print "doesn't match.\n";
1901 }
1902
3756e7ca 1903 or
4d36a948
ST
1904
1905 if (($match) = $Collator->match($str, $sub)) { # list context
1906 print "matches [$match].\n";
1907 } else {
1908 print "doesn't match.\n";
1909 }
1910
1911=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1912
1913If C<$substring> matches a part of C<$string>, returns
1914all the matching parts (or matching count in scalar context).
1915
1916If C<$substring> does not match any part of C<$string>,
1917returns an empty list.
1918
1919=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1920
1921If C<$substring> matches a part of C<$string>,
1922the first occurrence of the matching part is replaced by C<$replacement>
1923(C<$string> is modified) and return C<$count> (always equals to C<1>).
1924
1925C<$replacement> can be a C<CODEREF>,
1926taking the matching part as an argument,
1927and returning a string to replace the matching part
1928(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1929
1930=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1931
1932If C<$substring> matches a part of C<$string>,
1933all the occurrences of the matching part is replaced by C<$replacement>
1934(C<$string> is modified) and return C<$count>.
1935
1936C<$replacement> can be a C<CODEREF>,
1937taking the matching part as an argument,
1938and returning a string to replace the matching part
1939(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1940
1941e.g.
1942
1943 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1944 # (normalization => undef) is REQUIRED.
3756e7ca 1945 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
4d36a948
ST
1946 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1947
3756e7ca 1948 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
4d36a948 1949 # i.e., all the camels are made bold-faced.
d16e9e3d 1950
45394607
JH
1951=back
1952
3164dd77
ST
1953=head2 Other Methods
1954
1955=over 4
1956
0116f5dc
JH
1957=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1958
68adb2b0
CBW
1959=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1960
0116f5dc
JH
1961Change the value of specified keys and returns the changed part.
1962
1963 $Collator = Unicode::Collate->new(level => 4);
1964
1965 $Collator->eq("perl", "PERL"); # false
1966
1967 %old = $Collator->change(level => 2); # returns (level => 4).
1968
1969 $Collator->eq("perl", "PERL"); # true
1970
1971 $Collator->change(%old); # returns (level => 2).
1972
1973 $Collator->eq("perl", "PERL"); # false
1974
1975Not all C<(key,value)>s are allowed to be changed.
1976See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1977
1978In the scalar context, returns the modified collator
1979(but it is B<not> a clone from the original).
1980
1981 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1982
1983 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1984
1985 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1986
91ae00cb 1987=item C<$version = $Collator-E<gt>version()>
3164dd77 1988
91ae00cb
NC
1989Returns the version number (a string) of the Unicode Standard
1990which the C<table> file used by the collator object is based on.
1991If the table does not include a version line (starting with C<@version>),
1992returns C<"unknown">.
1993
1994=item C<UCA_Version()>
3164dd77 1995
91ae00cb 1996Returns the tracking version number of UTS #10 this module consults.
b5d9a953
CBW
1997C<UCA_Version()> should return the tracking version corresponding
1998with the DUCET incorporated.
3164dd77 1999
91ae00cb
NC
2000=item C<Base_Unicode_Version()>
2001
2002Returns the version number of UTS #10 this module consults.
3164dd77
ST
2003
2004=back
2005
3756e7ca
RGS
2006=head1 EXPORT
2007
2008No method will be exported.
45394607 2009
0d50d293
RGS
2010=head1 INSTALL
2011
2012Though this module can be used without any C<table> file,
2013to use this module easily, it is recommended to install a table file
2014in the UCA format, by copying it under the directory
2015<a place in @INC>/Unicode/Collate.
2016
6d24ed10
SP
2017The most preferable one is "The Default Unicode Collation Element Table"
2018(aka DUCET), available from the Unicode Consortium's website:
0d50d293
RGS
2019
2020 http://www.unicode.org/Public/UCA/
2021
2022 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
2023
2024If DUCET is not installed, it is recommended to copy the file
2025from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2026to <a place in @INC>/Unicode/Collate/allkeys.txt
2027manually.
2028
3756e7ca
RGS
2029=head1 CAVEATS
2030
2031=over 4
45394607 2032
3756e7ca 2033=item Normalization
45394607 2034
3756e7ca
RGS
2035Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2036module (see L<Unicode::Normalize>).
45394607 2037
5398038e 2038If you need not it (say, in the case when you need not
45394607
JH
2039handle any combining characters),
2040assign C<normalization =E<gt> undef> explicitly.
2041
4d36a948 2042-- see 6.5 Avoiding Normalization, UTS #10.
5398038e 2043
3756e7ca 2044=item Conformance Test
0116f5dc 2045
10d7ec48
NC
2046The Conformance Test for the UCA is available
2047under L<http://www.unicode.org/Public/UCA/>.
0116f5dc
JH
2048
2049For F<CollationTest_SHIFTED.txt>,
2050a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2051for F<CollationTest_NON_IGNORABLE.txt>, a collator via
91ae00cb 2052C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
0116f5dc 2053
4d36a948 2054B<Unicode::Normalize is required to try The Conformance Test.>
a7fbee98 2055
3756e7ca
RGS
2056=back
2057
0d50d293 2058=head1 AUTHOR, COPYRIGHT AND LICENSE
45394607 2059
0d50d293 2060The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
74b94a79 2061<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
0d50d293 2062SADAHIRO Tomoyuki. Japan. All rights reserved.
45394607 2063
0d50d293
RGS
2064This module is free software; you can redistribute it and/or
2065modify it under the same terms as Perl itself.
45394607 2066
00e00351
CBW
2067The file Unicode/Collate/allkeys.txt was copied verbatim
2068from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
2069This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
6d24ed10 2070Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
45394607
JH
2071
2072=head1 SEE ALSO
2073
2074=over 4
2075
91ae00cb
NC
2076=item Unicode Collation Algorithm - UTS #10
2077
2078L<http://www.unicode.org/reports/tr10/>
2079
2080=item The Default Unicode Collation Element Table (DUCET)
2081
10d7ec48 2082L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
45394607 2083
91ae00cb 2084=item The conformance test for the UCA
45394607 2085
10d7ec48 2086L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
a7fbee98 2087
10d7ec48 2088L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
45394607 2089
91ae00cb 2090=item Hangul Syllable Type
0116f5dc 2091
10d7ec48 2092L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
0116f5dc 2093
91ae00cb 2094=item Unicode Normalization Forms - UAX #15
a7fbee98 2095
91ae00cb 2096L<http://www.unicode.org/reports/tr15/>
a7fbee98 2097
aa7758f7
CBW
2098=item Unicode Locale Data Markup Language (LDML) - UTS #35
2099
2100L<http://www.unicode.org/reports/tr35/>
2101
45394607
JH
2102=back
2103
2104=cut