This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Document that symbols should be removed from metaconfig.h
[perl5.git] / cpan / Unicode-Collate / Collate.pm
... / ...
CommitLineData
1package Unicode::Collate;
2
# Compile-time sanity check: refuse to load when pack/unpack with the 'U'
# template cannot convert between a code point number and its character.
BEGIN {
    die "Unicode::Collate cannot stringify a Unicode code point\n"
        unless "A" eq pack('U', 0x41);
    die "Unicode::Collate cannot get a Unicode code point\n"
        unless 0x41 == unpack('U', 'A');
}
11
12use 5.006;
13use strict;
14use warnings;
15use Carp;
16use File::Spec;
17
18no warnings 'utf8';
19
20our $VERSION = '1.18';
21our $PACKAGE = __PACKAGE__;
22
23### begin XS only ###
24require DynaLoader;
25our @ISA = qw(DynaLoader);
26bootstrap Unicode::Collate $VERSION;
27### end XS only ###
28
# Directory components (below an @INC entry) and the file name of the
# default collation element table.
my @Path = ('Unicode', 'Collate');
my $KeyFile = 'allkeys.txt';

# Boolean values in Perl's own convention, and the position reported
# when nothing matches.
use constant TRUE => 1;
use constant FALSE => '';
use constant NOMATCHPOS => -1;

# Code reference used to look up a combining class; it is taken from
# Unicode::Normalize (i.e. \&Unicode::Normalize::getCombinClass).
# It also serves as a "Unicode::Normalize has been loaded" flag.
my $CVgetCombinClass;

# Range of supported collation levels
use constant MinLevel => 1;
use constant MaxLevel => 4;

# Smallest weights at level 2 and at level 3, respectively
use constant Min2Wt => 0x20;
use constant Min3Wt => 0x02;

# Weight used at the 4th level for shifted elements
use constant Shift4Wt => 0xFFFF;

# pack() template of one collation element: an unsigned char holding the
# "variable" boolean, followed by four 16-bit weights (one per level).
use constant VCE_TEMPLATE => 'Cn4';

# pack() template of a sort key: a sequence of 16-bit weights
use constant KEY_TEMPLATE => 'n*';

# pack() template of the tie-breaker: a sequence of 32-bit values
use constant TIE_TEMPLATE => 'N*';

# Separator between levels inside a sort key,
# i.e. pack(KEY_TEMPLATE, 0)
use constant LEVEL_SEP => "\0\0";

# Separator between code points in hash keys.
# Internally a "joined code point string" (JCPS below) such as "65;768"
# is used instead of a Perl Unicode string such as "\x41\x{300}",
# because native code points differ from Unicode code points
# on EBCDIC platforms.
# The separator must never occur in the stringified form of an integer.
use constant CODE_SEP => ';';
	# NOTE: regexes applied to a JCPS hard-code /;/ for this separator!

# Values of the "variable" flag of a collation element
use constant NON_VAR => 0; # non-variable character
use constant VAR => 1; # variable character

# First and last code point of the precomposed Hangul syllable range
use constant Hangul_SIni => 0xAC00;
use constant Hangul_SFin => 0xD7A3;

# Code points with Logical_Order_Exception in PropList.txt
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

# Collation elements used for the highestFFFF and minimalFFFE options
my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE);
91
# Default UCA revision number, used by new() when UCA_Version is not given.
sub UCA_Version {
    return "32";
}
93
# Unicode version string reported for this module.
sub Base_Unicode_Version {
    return "8.0.0";
}
95
96######
97
# Builds a string from a list of Unicode code points.
sub pack_U {
    my @code_points = @_;
    return pack('U*', @code_points);
}
101
# Returns the list of Unicode code points of the string given as the
# first argument.
sub unpack_U {
    my $str = shift(@_);
    # NOTE(review): the empty pack('U*') string is appended before unpacking,
    # presumably so that the argument is handled as characters rather than
    # bytes -- confirm against the pack/unpack documentation.
    my $joined = $str . pack('U*');
    return unpack('U*', $joined);
}
105
106######
107
# Accepted values of the 'variable' parameter. Only the keys are used;
# checkCollator() lowercases the value before looking it up here.
my %VariableOK = map { $_ => undef } qw/
    blanked  non-ignorable  shifted  shift-trimmed
/;

# Attributes that change() is allowed to modify.
our @ChangeOK = qw/
    alternate backwards level normalization rearrange
    katakana_before_hiragana upper_before_lower ignore_level2
    overrideCJK overrideHangul overrideOut preprocess UCA_Version
    hangul_terminator variable identical highestFFFF minimalFFFE
    long_contraction
  /;

# Attributes that change() refuses to modify (it croaks on them).
our @ChangeNG = qw/
    entry mapping table maxlength contraction
    ignoreChar ignoreName undefChar undefName rewrite
    versionTable alternateTable backwardsTable forwardsTable
    rearrangeTable variableTable
    derivCode normCode rearrangeHash backwardsFlag
    suppress suppressHash
    __useXS /; ### XS only
# History of removed hash keys:
#   'ignored'      was deleted at v 0.21.
#   'isShift'      was deleted at v 0.23.
#   'combining'    was deleted at v 0.24.
#   'entries'      was deleted at v 0.30.
#   'L3_ignorable' was deleted at v 0.40.
134
# Returns the version recorded from the table's "@version" line
# (see parseAtmark), or 'unknown' when none was recorded.
sub version {
    my ($self) = @_;
    my $table_version = $self->{versionTable};
    return $table_version ? $table_version : 'unknown';
}
139
# Lookup hashes built from @ChangeOK and @ChangeNG; only key existence
# is tested by change(), so every value is left undefined.
my %ChangeOK = map { $_ => undef } @ChangeOK;
my %ChangeNG = map { $_ => undef } @ChangeNG;
143
##
## %old = $self->change(%new)   (list context)
## $self = $self->change(%new)  (scalar context)
##
## Replaces attributes of an existing collator.
##   - Keys listed in @ChangeOK are stored in the object.
##   - Keys listed in @ChangeNG make the call croak.
##   - Any other key is silently ignored.
## 'alternate' is an alias for 'variable'; when both are passed,
## 'variable' wins. checkCollator() is run after the update.
## In list context the previous values of the changed keys are returned,
## otherwise the collator itself.
##
sub change {
    my $self = shift;
    my %hash = @_;
    my %old;
    if (exists $hash{alternate}) {
        if (exists $hash{variable}) {
            delete $hash{alternate};
        } else {
            $hash{variable} = $hash{alternate};
        }
    }

    # Reject forbidden keys before touching the object, so that a failing
    # call never leaves it partially updated. Keys are visited in sorted
    # order to make the reported key deterministic.
    foreach my $k (sort keys %hash) {
        next if exists $ChangeOK{$k};
        croak "change of $k via change() is not allowed!"
            if exists $ChangeNG{$k};
    }

    foreach my $k (keys %hash) {
        next unless exists $ChangeOK{$k}; # unknown keys are ignored
        $old{$k} = $self->{$k};
        $self->{$k} = $hash{$k};
    }
    $self->checkCollator();
    return wantarray ? %old : $self;
}
167
# Croaks unless $level lies within MinLevel..MaxLevel.
# $key names the parameter being checked ('level' or 'backwards') and
# only appears in the error message.
sub _checkLevel {
    my ($level, $key) = @_;

    unless (MinLevel <= $level) {
        croak sprintf
            "Illegal level %d (in value for key '%s') lower than %d.",
            $level, $key, MinLevel;
    }
    unless ($level <= MaxLevel) {
        croak sprintf
            "Unsupported level %d (in value for key '%s') higher than %d.",
            $level, $key, MaxLevel;
    }
    return 1;
}
178
# Maps a supported UCA revision number to the routine stored as
# $self->{derivCode} by checkCollator(). Several revisions share a routine.
my %DerivCode = (
    8  => \&_derivCE_8,
    9  => \&_derivCE_9,
    11 => \&_derivCE_9,  # same as 9
    14 => \&_derivCE_14,
    16 => \&_derivCE_14, # same as 14
    18 => \&_derivCE_18,
    20 => \&_derivCE_20,
    22 => \&_derivCE_22,
    24 => \&_derivCE_24,
    26 => \&_derivCE_24, # same as 24
    28 => \&_derivCE_24, # same as 24
    30 => \&_derivCE_24, # same as 24
    32 => \&_derivCE_32,
);
194
##
## $self->checkCollator()
##
## Validates the user-settable attributes and (re)computes the derived
## ones: derivCode, variable/alternate, backwardsFlag, rearrangeHash and
## normCode. Croaks on an invalid level, UCA version, variable name,
## rearrange value or normalization form. Returns nothing.
##
sub checkCollator {
    my $self = shift;
    _checkLevel($self->{level}, "level");

    my $deriv = $DerivCode{ $self->{UCA_Version} };
    $self->{derivCode} = $deriv;
    croak "Illegal UCA version (passed $self->{UCA_Version})."
        if !$deriv;

    # 'variable' falls back to 'alternate', then to the values read from
    # the table, then to 'shifted'; both keys end up with the same
    # lowercased value.
    my $variable = $self->{variable} || $self->{alternate}
        || $self->{variableTable} || $self->{alternateTable} || 'shifted';
    $variable = lc($variable);
    $self->{alternate} = $variable;
    $self->{variable} = $variable;
    croak "$PACKAGE unknown variable parameter name: $self->{variable}"
        unless exists $VariableOK{$variable};

    # backwardsFlag is a bit mask with bit N set for each backward level N.
    my $backwards = $self->{backwards};
    if (ref $backwards) {
        my %seen;
        $self->{backwardsFlag} = 0;
        for my $lv (@$backwards) {
            _checkLevel($lv, "backwards");
            $seen{$lv} = 1; # each level is counted once
        }
        $self->{backwardsFlag} += 1 << $_ for sort keys %seen;
    } elsif (defined $backwards) {
        _checkLevel($backwards, "backwards");
        $self->{backwardsFlag} = 1 << $backwards;
    } else {
        $self->{backwardsFlag} = 0;
    }

    $self->{rearrange} = [] unless defined $self->{rearrange};
    croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"
        unless ref $self->{rearrange};

    # The keys of $self->{rearrangeHash} are the elements of
    # $self->{rearrange}; it stays undef when the list is empty.
    $self->{rearrangeHash} = undef;
    my @rearrange = @{ $self->{rearrange} };
    $self->{rearrangeHash} = { map { $_ => undef } @rearrange }
        if @rearrange;

    $self->{normCode} = undef;
    my $form = $self->{normalization};
    return if !defined $form;

    eval { require Unicode::Normalize };
    croak "Unicode::Normalize is required to normalize strings"
        if $@;

    $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

    if ($form =~ /^(?:NF)D\z/) { # shortcut for the default form
        $self->{normCode} = \&Unicode::Normalize::NFD;
    }
    elsif ($form ne 'prenormalized') {
        $self->{normCode} = sub {
            Unicode::Normalize::normalize($form, shift);
        };
        # A trial call detects an unknown form name right away.
        eval { $self->{normCode}->("") };
        croak "$PACKAGE unknown normalization form name: $form"
            if $@;
    }
    return;
}
258
##
## $collator = Unicode::Collate->new(%tailoring)
##
## Builds a collator from key/value parameters: loads the collation
## element table (unless table => undef was passed), applies the 'entry'
## lines, fills in defaults and validates the result via checkCollator().
##
sub new
{
    my $class = shift;
    my $self = bless { @_ }, $class;

### begin XS only ###
    # _fetch_simple is stored in __useXS only when no table was named and
    # none of the entry rewriting/filtering options is defined.
    my $customized = exists($self->{table}) || grep {
        defined $self->{$_}
    } qw(rewrite undefName ignoreName undefChar ignoreChar);
    $self->{__useXS} = $customized ? undef : \&_fetch_simple;
### end XS only ###

    # The keys of $self->{suppressHash} are the elements of
    # $self->{suppress}; this must happen before read_table().
    my $suppress = $self->{suppress};
    if ($suppress && @$suppress) {
        @{ $self->{suppressHash} }{ @$suppress } = ();
    }

    # An explicitly passed undef means that no table file is read.
    unless (exists $self->{table}) {
        $self->{table} = $KeyFile;
    }
    if (defined $self->{table}) {
        $self->read_table();
    }

    # Each non-empty line of 'entry' is parsed as a tailoring entry.
    if ($self->{entry}) {
        for my $entry_line ($self->{entry} =~ /([^\n]+)/g) {
            $self->parseEntry($entry_line, TRUE);
        }
    }

    # Defaults applied only here, not in change()
    $self->{level} ||= MaxLevel;
    $self->{UCA_Version} ||= UCA_Version();

    unless (exists $self->{overrideHangul}) {
        $self->{overrideHangul} = FALSE;
    }
    unless (exists $self->{overrideCJK}) {
        $self->{overrideCJK} = FALSE;
    }
    unless (exists $self->{normalization}) {
        $self->{normalization} = 'NFD';
    }
    unless (exists $self->{rearrange}) {
        $self->{rearrange} = $self->{rearrangeTable} ||
            ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []);
    }
    unless (exists $self->{backwards}) {
        $self->{backwards} = $self->{backwardsTable};
    }
    unless (exists $self->{long_contraction}) {
        $self->{long_contraction}
            = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
    }

    # Validate the settings and compute the derived attributes.
    $self->checkCollator();

    return $self;
}
312
##
## $self->parseAtmark($line)
##
## Handles one "@" directive line of a table; $line is the text after the
## leading whitespace and '@' have been removed. The first version,
## variable and alternate values are kept; backwards, forwards and
## rearrange values accumulate. Unrecognized directives are ignored.
##
sub parseAtmark {
    my ($self, $line) = @_;

    if (my ($version) = $line =~ /^version\s*(\S*)/) {
        $self->{versionTable} ||= $version;
    }
    elsif (my ($variable) = $line =~ /^variable\s+(\S*)/) { # since UTS #10-9
        $self->{variableTable} ||= $variable;
    }
    elsif (my ($alternate) = $line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
        $self->{alternateTable} ||= $alternate;
    }
    elsif (my ($backwards) = $line =~ /^backwards\s+(\S*)/) {
        push @{ $self->{backwardsTable} }, $backwards;
    }
    elsif (my ($forwards) = $line =~ /^forwards\s+(\S*)/) { # perhaps no use
        push @{ $self->{forwardsTable} }, $forwards;
    }
    elsif (my ($list) = $line =~ /^rearrange\s+(.*)/) { # (\S*) would not do
        push @{ $self->{rearrangeTable} }, _getHexArray($list);
    }
}
336
##
## $self->read_table()
##
## Loads the collation element table into $self.
## With __useXS set, the lines returned by _fetch_rest() are parsed.
## Otherwise the file named by $self->{table} is searched for under
## Unicode/Collate in each @INC directory and parsed line by line;
## the call croaks when the file cannot be opened anywhere.
## Lines starting with '#' are skipped, '@' lines go to parseAtmark()
## and all other lines to parseEntry().
##
sub read_table {
    my $self = shift;

### begin XS only ###
    if ($self->{__useXS}) {
        my @rest = _fetch_rest(); # lines that still need to be parsed
        for my $line (@rest) {
            next if $line =~ /^\s*#/;

            if ($line =~ s/^\s*\@//) {
                $self->parseAtmark($line);
            } else {
                $self->parseEntry($line);
            }
        }
        return;
    }
### end XS only ###

    my($f, $fh);
    foreach my $d (@INC) {
        $f = File::Spec->catfile($d, @Path, $self->{table});
        # Three-argument open: the path is always opened read-only, even
        # if it contains mode characters or leading/trailing whitespace.
        last if open($fh, '<', $f);
        $f = undef;
    }
    if (!defined $f) {
        $f = File::Spec->catfile(@Path, $self->{table});
        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
    }

    while (my $line = <$fh>) {
        next if $line =~ /^\s*#/;

        if ($line =~ s/^\s*\@//) {
            $self->parseAtmark($line);
        } else {
            $self->parseEntry($line);
        }
    }
    close $fh;
}
378
379
380##
381## get $line, parse it, and write an entry in $self
382##
##
## $self->parseEntry($line, $tailoring)
##
## Parses one table line of the form
##     <charList> ; <collElement>... # name
## and stores it in $self:
##   {mapping}{JCPS}     arrayref of packed collation elements
##                       (empty when all weights at levels 1-3 are zero)
##   {maxlength}{first}  longest code point sequence starting with 'first'
##   {contraction}{JCPS} marks proper prefixes (length >= 2) of longer
##                       sequences
## $tailoring is true for lines coming from the 'entry' parameter; such
## lines are not affected by 'suppress'.
## Croaks when the ';' separated collation element part is missing.
##
sub parseEntry
{
    my $self = shift;
    my $line = shift;
    my $tailoring = shift;
    my($entry, @uv, @key);

    # Start from an empty name so that matching against undefName or
    # ignoreName does not warn on lines that carry no comment.
    my $name = '';

    if (defined $self->{rewrite}) {
        $line = $self->{rewrite}->($line);
    }

    return if $line !~ /^\s*[0-9A-Fa-f]/;

    # removes the comment and keeps it as the name
    $name = $1
        if $line =~ s/[#%]\s*(.*)//;
    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

    # gets element
    my($e, $k) = split /;/, $line;
    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
        if ! $k;

    @uv = _getHexArray($e);
    return if !@uv;
    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
              exists $self->{suppressHash}{$uv[0]};
    $entry = join(CODE_SEP, @uv); # in JCPS

    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
        my $ele = pack_U(@uv);

        # regarded as if it were not stored in the table
        return
            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

        # replaced as completely ignorable
        $k = '[.0000.0000.0000.0000]'
            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
    }

    # replaced as completely ignorable
    $k = '[.0000.0000.0000.0000]'
        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

    my $is_L3_ignorable = TRUE;

    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
        my @wt = _getHexArray($arr);
        push @key, pack(VCE_TEMPLATE, $var, @wt);
        $is_L3_ignorable = FALSE
            if $wt[0] || $wt[1] || $wt[2];
        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
        # is completely ignorable.
        # For expansion, an entry $is_L3_ignorable
        # if and only if "all" CEs are [.0000.0000.0000].
    }

    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

    if (@uv > 1) {
        if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
            $self->{maxlength}{$uv[0]} = @uv;
        }
    }
    # Record every proper prefix of two or more code points.
    while (@uv > 2) {
        pop @uv;
        my $fake_entry = join(CODE_SEP, @uv); # in JCPS
        $self->{contraction}{$fake_entry} = 1;
    }
}
455
456
# Returns what visualizeSortKey() produces for the sort key of $str.
sub viewSortKey
{
    my ($self, $str) = @_;
    my @sort_key = $self->getSortKey($str);
    return $self->visualizeSortKey(@sort_key);
}
463
464
# Applies the 'preprocess' hook and then the normalization code to $str,
# each only when it is a reference, and returns the resulting string.
sub process
{
    my ($self, $str) = @_;
    my @hooks = ($self->{preprocess}, $self->{normCode});

    for my $hook (@hooks) {
        next unless ref $hook;
        $str = $hook->($str);
    }
    return $str;
}
476
477##
478## arrayref of JCPS = splitEnt(string to be collated)
479## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
480##
##
## arrayref of JCPS = splitEnt(string to be collated)
## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
##
## Splits a string into collation units (single code points or
## contractions found in the table). With a true second argument each
## unit also carries the start and end index (in code points) it covers.
##
sub splitEnt
{
    my ($self, $str, $wLen) = @_; # $wLen: "with length/positions"

    my $map  = $self->{mapping};
    my $max  = $self->{maxlength};
    my $reH  = $self->{rearrangeHash};
    my $vers = $self->{UCA_Version};
    my $ver9 = $vers >= 9 && $vers <= 11;
    my $long = $self->{long_contraction};
    my $uXS  = $self->{__useXS}; ### XS only

    my @units;

    # code points of the string
    my @src = unpack_U($str);

    # Rearrangement: swap a listed code point with its successor.
    # Positions cannot be kept once code points are swapped, so this
    # step is skipped when positions are requested.
    if ($reH && ! $wLen) {
        for (my $i = 0; $i < @src; $i++) {
            next unless exists $reH->{ $src[$i] } && $i + 1 < @src;
            @src[$i, $i + 1] = @src[$i + 1, $i];
            $i++; # do not look at the swapped-in successor again
        }
    }

    # Blank out code points that are to be removed beforehand.
    for my $cp (@src) {
        if ($vers <= 20 && _isIllegal($cp)) {
            $cp = undef;
        } elsif ($ver9) {
            $cp = undef if $map->{$cp}
                           ? @{ $map->{$cp} } == 0
                           : $uXS && _ignorable_simple($cp); ### XS only
        }
    }

    for (my $i = 0; $i < @src; $i++) {
        my $jcps = $src[$i];

        # A removed code point only extends the previous unit's end.
        if (! defined $jcps) {
            $units[-1][2] = $i + 1
                if $wLen && @units;
            next;
        }

        my $start = $i;

        # look for a contraction starting here
        if ($max->{$jcps}) {
            my $candidate = $jcps;
            my $cand_len  = 1;
            my $limit     = $max->{$jcps};

            # contiguous contraction: keep the longest candidate found
            for (my $p = $i + 1; $cand_len < $limit && $p < @src; $p++) {
                next if ! defined $src[$p];
                $candidate .= CODE_SEP . $src[$p];
                $cand_len++;
                if ($map->{$candidate}) {
                    $jcps = $candidate;
                    $i = $p;
                }
            }

            # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
            # This process requires Unicode::Normalize.
            # If "normalization" is undef, here should be skipped *always*
            # (in spite of bool value of $CVgetCombinClass),
            # since canonical ordering cannot be expected.
            # Blocked combining character should not be contracted.

            # $self->{normCode} is false in the case of "prenormalized".
            if ($self->{normalization}) {
                my $cont = $self->{contraction};
                my $preCC = 0;
                my $preCC_uc = 0;
                my $jcps_uc = $jcps;
                my(@out, @out_uc);

                for (my $p = $i + 1; $p < @src; $p++) {
                    next if ! defined $src[$p];
                    my $curCC = $CVgetCombinClass->($src[$p]);
                    last unless $curCC;
                    my $tail = CODE_SEP . $src[$p];

                    if ($preCC != $curCC && $map->{$jcps.$tail}) {
                        $jcps .= $tail;
                        push @out, $p;
                    } else {
                        $preCC = $curCC;
                    }

                    next if !$long;

                    # With long_contraction, a second candidate may also
                    # grow through entries of $self->{contraction}.
                    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
                                               $cont->{$jcps_uc.$tail})) {
                        $jcps_uc .= $tail;
                        push @out_uc, $p;
                    } else {
                        $preCC_uc = $curCC;
                    }
                }

                # Consume the code points absorbed by the chosen candidate.
                if (@out_uc && $map->{$jcps_uc}) {
                    $jcps = $jcps_uc;
                    $src[$_] = undef for @out_uc;
                } else {
                    $src[$_] = undef for @out;
                }
            }
        }

        # skip completely ignorable
        if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
            $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
            $units[-1][2] = $i + 1
                if $wLen && @units;
            next;
        }

        push @units, $wLen ? [$jcps, $start, $i + 1] : $jcps;
    }
    return \@units;
}
613
614##
615## VCE = _pack_override(input, codepoint, derivCode)
616##
##
## VCE = _pack_override(input, codepoint, derivCode)
##
## input is one value returned by an override callback:
##   - an array reference: its elements are packed as the four weights;
##   - a defined non-reference: used as the primary weight, with the
##     minimum level 2 and 3 weights and the code point at level 4;
##   - undef: derivCode is called for the code point (values above
##     0x10FFFF are replaced by 0xFFFD first).
##
sub _pack_override ($$$) {
    my ($r, $u, $der) = @_;

    return pack(VCE_TEMPLATE, NON_VAR, @$r)
        if ref $r;
    return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u)
        if defined $r;

    $u = 0xFFFD if 0x10FFFF < $u;
    return $der->($u);
}
631
632##
633## list of VCE = getWt(JCPS)
634##
##
## list of VCE = getWt(JCPS)
##
## Returns the collation elements for one collation unit, each passed
## through varCE(). The sources are tried in this order: the mapping
## table, the XS lookup, Hangul syllable handling, the overrideOut
## callback (above 0x10FFFF), overrideCJK, and finally derivCode.
##
sub getWt
{
    my ($self, $u) = @_;
    my $map = $self->{mapping};
    my $der = $self->{derivCode};
    my $out = $self->{overrideOut};
    my $uXS = $self->{__useXS}; ### XS only

    return if !defined $u;
    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;

    my @ce;
    if ($map->{$u}) {
        @ce = @{ $map->{$u} }; # $u may be a contraction
### begin XS only ###
    } elsif ($uXS && _exists_simple($u)) {
        @ce = _fetch_simple($u);
### end XS only ###
    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
        my $hang = $self->{overrideHangul};
        if ($hang) {
            @ce = map _pack_override($_, $u, $der), $hang->($u);
        } elsif (!defined $hang) {
            @ce = $der->($u);
        } else {
            # Defined but false: decompose the syllable into jamo and
            # weight those, preferring contractions found in the table.
            my $max = $self->{maxlength};
            my @jamo = _decompHangul($u);

            if (@jamo == 2) {
                my $pair = join(CODE_SEP, @jamo);
                @jamo = ($pair) if $map->{$pair};
            } else { # expected to hold three jamo here
                if ($max->{$jamo[0]}) {
                    my $triple = join(CODE_SEP, @jamo);
                    if ($map->{$triple}) {
                        @jamo = ($triple);
                    } else {
                        my $head = join(CODE_SEP, @jamo[0,1]);
                        @jamo = ($head, $jamo[2]) if $map->{$head};
                    }
                    # even if V's ignorable, LT contraction is not supported.
                    # If such a situation were required, NFD should be used.
                }
                if (@jamo == 3 && $max->{$jamo[1]}) {
                    my $rest = join(CODE_SEP, @jamo[1,2]);
                    @jamo = ($jamo[0], $rest) if $map->{$rest};
                }
            }

            @ce = map({
                    $map->{$_} ? @{ $map->{$_} } :
                $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
                    $der->($_);
                } @jamo);
        }
    } elsif ($out && 0x10FFFF < $u) {
        @ce = map _pack_override($_, $u, $der), $out->($u);
    } else {
        my $cjk  = $self->{overrideCJK};
        my $vers = $self->{UCA_Version};
        if ($cjk && _isUIdeo($u, $vers)) {
            @ce = map _pack_override($_, $u, $der), $cjk->($u);
        } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
            @ce = _uideoCE_8($u);
        } else {
            @ce = $der->($u);
        }
    }
    return map $self->varCE($_), @ce;
}
708
709
710##
711## string sortkey = getSortKey(string arg)
712##
##
## string sortkey = getSortKey(string arg)
##
## Runs the string through process() and splitEnt(), collects the
## weights of every collation unit (inserting the hangul_terminator
## element where configured) and hands them to mk_SortKey().
## With 'identical' set, a level separator and the code points of the
## processed string are appended as a tie-breaker.
##
sub getSortKey
{
    my ($self, $orig) = @_;
    my $str  = $self->process($orig);
    my $rEnt = $self->splitEnt($str); # arrayref of JCPS
    my $vers = $self->{UCA_Version};
    my $term = $self->{hangul_terminator};
    my $lev  = $self->{level};
    my $iden = $self->{identical};

    my @buf; # weight arrays
    if ($term) {
        my $preHST = '';
        my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
        foreach my $unit (@$rEnt) {
            # weird things like VL, TL-contraction are not considered!
            my $curHST = join '', map getHST($_, $vers), split /;/, $unit;
            my $needs_term =
                $preHST && !$curHST || # hangul before non-hangul
                $preHST =~ /L\z/ && $curHST =~ /^T/ ||
                $preHST =~ /V\z/ && $curHST =~ /^L/ ||
                $preHST =~ /T\z/ && $curHST =~ /^[LV]/;
            push @buf, $termCE if $needs_term;
            $preHST = $curHST;
            push @buf, $self->getWt($unit);
        }
        push @buf, $termCE if $preHST; # the string ends with hangul
    } else {
        foreach my $unit (@$rEnt) {
            push @buf, $self->getWt($unit);
        }
    }

    my $rkey = $self->mk_SortKey(\@buf); ### XS only

    if ($iden || $vers >= 26 && $lev == MaxLevel) {
        $rkey .= LEVEL_SEP;
        $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
    }
    return $rkey;
}
755
756
757##
758## int compare = cmp(string a, string b)
759##
# Each method compares the sort keys of its two string arguments with the
# string operator of the same name.
sub cmp {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) cmp $self->getSortKey($rhs);
}
sub eq {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) eq $self->getSortKey($rhs);
}
sub ne {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ne $self->getSortKey($rhs);
}
sub lt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) lt $self->getSortKey($rhs);
}
sub le {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) le $self->getSortKey($rhs);
}
sub gt {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) gt $self->getSortKey($rhs);
}
sub ge {
    my ($self, $lhs, $rhs) = @_;
    return $self->getSortKey($lhs) ge $self->getSortKey($rhs);
}
767
768##
769## list[strings] sorted = sort(list[strings] arg)
770##
##
## list[strings] sorted = sort(list[strings] arg)
##
## Computes the sort key of every string once, orders the strings by
## comparing those keys, and returns the strings in that order.
##
sub sort {
    my ($obj, @strings) = @_;
    my @keyed   = map { [ $obj->getSortKey($_), $_ ] } @strings;
    my @ordered = sort { $a->[0] cmp $b->[0] } @keyed;
    return map { $_->[1] } @ordered;
}
778
779
780##
781## bool _nonIgnorAtLevel(arrayref weights, int level)
782##
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
## True when any weight from level MinLevel up to the given level is
## non-zero. Returns nothing when the weights reference is undefined.
##
sub _nonIgnorAtLevel($$)
{
    my ($wt, $lv) = @_;
    return if ! defined $wt;

    my $nonzero = grep { $wt->[$_ - 1] != 0 } MinLevel..$lv;
    return $nonzero ? TRUE : FALSE;
}
790
791##
792## bool _eqArray(
793## arrayref of arrayref[weights] source,
794## arrayref of arrayref[weights] substr,
795## int level)
796## * comparison of graphemes vs graphemes.
797## @$source >= @$substr must be true (check it before call this);
798##
##
## bool _eqArray(
##    arrayref of arrayref[weights] source,
##    arrayref of arrayref[weights] substr,
##    int level)
## * comparison of graphemes vs graphemes.
##   @$source >= @$substr must be true (the caller has to ensure this).
##
## Returns 1 when the leading graphemes of source have the same number of
## weight arrays as those of substr and the same weights up to the given
## level; returns nothing otherwise.
##
sub _eqArray($$$)
{
    my ($source, $substr, $lev) = @_;

    for my $g (0 .. $#$substr) {
        my $src_g = $source->[$g];
        my $sub_g = $substr->[$g];

        # Do the $g'th graphemes have the same number of weight arrays?
        return if @$src_g != @$sub_g;

        for my $w (0 .. $#$sub_g) {
            my $src_wt = $src_g->[$w];
            my $sub_wt = $sub_g->[$w];
            for my $v (0 .. $lev - 1) {
                return if $src_wt->[$v] != $sub_wt->[$v];
            }
        }
    }
    return 1;
}
817
818##
819## (int position, int length)
820## int position = index(string, substring, position, [undoc'ed global])
821##
822## With "global" (only for the list context),
823## returns list of arrayref[position, length].
824##
##
## (int position, int length)
## int position = index(string, substring, position, [undoc'ed global])
##
## With "global" (only for the list context),
##  returns list of arrayref[position, length].
##
## Without a match, returns an empty list in list context and
## NOMATCHPOS in scalar context. Croaks when the collator has a
## preprocess hook or normalization code.
##
sub index
{
    my $self = shift;
    croak "Don't use Preprocess with index(), match(), etc."
        if $self->{preprocess};
    croak "Don't use Normalization with index(), match(), etc."
        if $self->{normCode};

    my $str  = shift;
    my $len  = length($str);
    my $sub  = shift;
    my $subE = $self->splitEnt($sub);
    my $pos  = @_ ? shift : 0;
    $pos = 0 if $pos < 0;
    my $glob = shift;

    my $lev = $self->{level};
    my $v2i = $self->{UCA_Version} >= 9 &&
              $self->{variable} ne 'non-ignorable';

    # A substring without any collation unit matches with length zero.
    if (! @$subE) {
        my $at = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
        return $glob
            ? map([$_, 0], $at..$len)
            : wantarray ? ($at,0) : $at;
    }
    return wantarray ? () : NOMATCHPOS
        if $len < $pos;
    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
    return wantarray ? () : NOMATCHPOS
        if ! @$strE;

    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

    # Group the weights of the substring into graphemes.
    my $after_variable;
    for my $vwt (map $self->getWt($_), @$subE) {
        my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
        my $keep = _nonIgnorAtLevel(\@wt,$lev);

        # "Ignorable (L1, L2) after Variable" since track. v. 9
        if ($v2i) {
            if ($var) {
                $after_variable = TRUE;
            }
            elsif (!$wt[0]) { # ignorable
                $keep = FALSE if $after_variable;
            }
            else {
                $after_variable = FALSE;
            }
        }

        if (@subWt && !$var && !$wt[0]) {
            push @{ $subWt[-1] }, \@wt if $keep;
        } elsif ($keep) {
            push @subWt, [ \@wt ];
        }
        # otherwise the element is skipped
    }

    my $last = @$strE - 1;

    $after_variable = FALSE; # reused for the scan of the string
    for (my $i = 0; $i <= $last; ) { # $i is advanced inside the loop
        my $bases = 0;

        # fetch a grapheme
        while ($i <= $last && $bases == 0) {
            for my $vwt ($self->getWt($strE->[$i][0])) {
                my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
                my $keep = _nonIgnorAtLevel(\@wt,$lev);

                # "Ignorable (L1, L2) after Variable" since track. v. 9
                if ($v2i) {
                    if ($var) {
                        $after_variable = TRUE;
                    }
                    elsif (!$wt[0]) { # ignorable
                        $keep = FALSE if $after_variable;
                    }
                    else {
                        $after_variable = FALSE;
                    }
                }

                if (@strWt && !$var && !$wt[0]) {
                    push @{ $strWt[-1] }, \@wt if $keep;
                    $finPos[-1] = $strE->[$i][2];
                } elsif ($keep) {
                    push @strWt, [ \@wt ];
                    # A second base from the same unit gets no valid start,
                    # and the previous grapheme no valid end.
                    push @iniPos, $bases ? NOMATCHPOS : $strE->[$i][1];
                    $finPos[-1] = NOMATCHPOS if $bases;
                    push @finPos, $strE->[$i][2];
                    $bases++;
                }
                # otherwise nothing is recorded
            }
            $i++;
        }

        # try to match
        while ( @strWt > @subWt || (@strWt == @subWt && $i > $last) ) {
            if ($iniPos[0] != NOMATCHPOS &&
                    $finPos[$#subWt] != NOMATCHPOS &&
                        _eqArray(\@strWt, \@subWt, $lev)) {
                my $found_at  = $iniPos[0] + $pos;
                my $found_len = $finPos[$#subWt] - $iniPos[0];

                return wantarray ? ($found_at, $found_len) : $found_at
                    if ! $glob;

                push @g_ret, [$found_at, $found_len];
                # Drop the matched graphemes (the last one is removed by
                # the shifts below) before continuing the search.
                splice @strWt,  0, $#subWt;
                splice @iniPos, 0, $#subWt;
                splice @finPos, 0, $#subWt;
            }
            shift @strWt;
            shift @iniPos;
            shift @finPos;
        }
    }

    return $glob
        ? @g_ret
        : wantarray ? () : NOMATCHPOS;
}
955
956##
957## scalarref to matching part = match(string, substring)
958##
##
## scalarref to matching part = match(string, substring)
##
## In list context the matching part itself is returned instead of a
## reference. Returns nothing when index() finds no match.
##
sub match
{
    my $self = shift;
    my ($pos, $len) = $self->index($_[0], $_[1])
        or return;

    # A copy is taken on purpose: an lvalue reference (\substr) would
    # change its value whenever the original string is modified.
    my $matched = substr($_[0], $pos, $len);
    return wantarray ? $matched : \$matched;
}
972
973##
974## arrayref matching parts = gmatch(string, substring)
975##
##
## list of matching parts = gmatch(string, substring)
##
## Returns the part of the string covered by each [position, length]
## pair that index() reports in its global mode.
##
sub gmatch
{
    my ($self, $str, $sub) = @_;
    my @hits = $self->index($str, $sub, 0, 'g');
    return map { substr($str, $_->[0], $_->[1]) } @hits;
}
984
985##
986## bool subst'ed = subst(string, substring, replace)
987##
##
## bool subst'ed = subst(string, substring, replace)
##
## Replaces the first match of substring in the caller's string (which
## is modified in place). 'replace' is either a string or a code
## reference that receives the matching part and returns its replacement.
##
sub subst
{
    my $self = shift;
    my $code = ref($_[2]) eq 'CODE' ? $_[2] : FALSE;

    my ($pos, $len) = $self->index($_[0], $_[1])
        or return FALSE;

    if ($code) {
        my $matched = substr($_[0], $pos, $len);
        my $replacement = $code->($matched);
        substr($_[0], $pos, $len, $replacement);
    } else {
        substr($_[0], $pos, $len, $_[2]);
    }
    return TRUE;
}
1006
1007##
1008## int count = gsubst(string, substring, replace)
1009##
##
## int count = gsubst(string, substring, replace)
##
## Replaces every match of substring in the caller's string (which is
## modified in place) and returns the number of replacements.
## 'replace' is either a string or a code reference that receives the
## matching part and returns its replacement.
##
sub gsubst
{
    my $self = shift;
    my $code = ref($_[2]) eq 'CODE' ? $_[2] : FALSE;
    my $cnt = 0;

    # Work from the last match backwards so that the positions of the
    # matches still to be replaced stay valid.
    my @hits = $self->index($_[0], $_[1], 0, 'g');
    for my $hit (reverse @hits) {
        my ($pos, $len) = @$hit;
        if ($code) {
            my $matched = substr($_[0], $pos, $len);
            my $replacement = $code->($matched);
            substr($_[0], $pos, $len, $replacement);
        } else {
            substr($_[0], $pos, $len, $_[2]);
        }
        $cnt++;
    }
    return $cnt;
}
1028
10291;
1030__END__
1031
1032=head1 NAME
1033
1034Unicode::Collate - Unicode Collation Algorithm
1035
1036=head1 SYNOPSIS
1037
1038 use Unicode::Collate;
1039
1040 #construct
1041 $Collator = Unicode::Collate->new(%tailoring);
1042
1043 #sort
1044 @sorted = $Collator->sort(@not_sorted);
1045
1046 #compare
1047 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1048
B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
according to Perl's Unicode support. See L<perlunicode>,
L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
If your strings are not yet decoded, either decode them beforehand
or use C<preprocess> to do so.
1053
1054=head1 DESCRIPTION
1055
1056This module is an implementation of Unicode Technical Standard #10
1057(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1058
1059=head2 Constructor and Tailoring
1060
1061The C<new> method returns a collator object. If new() is called
1062with no parameters, the collator should do the default collation.
1063
1064 $Collator = Unicode::Collate->new(
1065 UCA_Version => $UCA_Version,
1066 alternate => $alternate, # alias for 'variable'
1067 backwards => $levelNumber, # or \@levelNumbers
1068 entry => $element,
1069 hangul_terminator => $term_primary_weight,
1070 highestFFFF => $bool,
1071 identical => $bool,
1072 ignoreName => qr/$ignoreName/,
1073 ignoreChar => qr/$ignoreChar/,
1074 ignore_level2 => $bool,
1075 katakana_before_hiragana => $bool,
1076 level => $collationLevel,
1077 long_contraction => $bool,
1078 minimalFFFE => $bool,
1079 normalization => $normalization_form,
1080 overrideCJK => \&overrideCJK,
1081 overrideHangul => \&overrideHangul,
1082 preprocess => \&preprocess,
1083 rearrange => \@charList,
1084 rewrite => \&rewrite,
1085 suppress => \@charList,
1086 table => $filename,
1087 undefName => qr/$undefName/,
1088 undefChar => qr/$undefChar/,
1089 upper_before_lower => $bool,
1090 variable => $variable,
1091 );
1092
1093=over 4
1094
1095=item UCA_Version
1096
1097If the revision (previously "tracking version") number of UCA is given,
1098behavior of that revision is emulated on collating.
1099If omitted, the return value of C<UCA_Version()> is used.
1100
1101The following revisions are supported. The default is 32.
1102
1103 UCA Unicode Standard DUCET (@version)
1104 -------------------------------------------------------
1105 8 3.1 3.0.1 (3.0.1d9)
1106 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1107 11 4.0 4.0.0 (4.0.0)
1108 14 4.1.0 4.1.0 (4.1.0)
1109 16 5.0 5.0.0 (5.0.0)
1110 18 5.1.0 5.1.0 (5.1.0)
1111 20 5.2.0 5.2.0 (5.2.0)
1112 22 6.0.0 6.0.0 (6.0.0)
1113 24 6.1.0 6.1.0 (6.1.0)
1114 26 6.2.0 6.2.0 (6.2.0)
1115 28 6.3.0 6.3.0 (6.3.0)
1116 30 7.0.0 7.0.0 (7.0.0)
1117 32 8.0.0 8.0.0 (8.0.0)
1118
1119* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
1120
1121* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1122since C<UCA_Version> 22.
1123
1124* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1125and can be overridden since C<UCA_Version> 22.
1126
1127* Fully ignorable characters were ignored, and would not interrupt
1128contractions with C<UCA_Version> 9 and 11.
1129
1130* Treatment of ignorables after variables and some behaviors
1131were changed at C<UCA_Version> 9.
1132
1133* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1134depend on C<UCA_Version>.
1135
* Many hangul jamo are assigned at C<UCA_Version> 20, which affects
C<hangul_terminator>.
1138
1139=item alternate
1140
1141-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1142
1143For backward compatibility, C<alternate> (old name) can be used
1144as an alias for C<variable>.
1145
1146=item backwards
1147
1148-- see 3.4 Backward Accents, UTS #10.
1149
1150 backwards => $levelNumber or \@levelNumbers
1151
Weights at the given level(s) are compared in reverse order;
e.g. level 2 (diacritic ordering) in French.
If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
weights are compared forwards at all the levels.
1155
1156=item entry
1157
1158-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1159
1160If the same character (or a sequence of characters) exists
1161in the collation element table through C<table>,
1162mapping to collation elements is overridden.
1163If it does not exist, the mapping is defined additionally.
1164
1165 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
11660063 0068 ; [.0E6A.0020.0002.0063] # ch
11670043 0068 ; [.0E6A.0020.0007.0043] # Ch
11680043 0048 ; [.0E6A.0020.0008.0043] # CH
1169006C 006C ; [.0F4C.0020.0002.006C] # ll
1170004C 006C ; [.0F4C.0020.0007.004C] # Ll
1171004C 004C ; [.0F4C.0020.0008.004C] # LL
117200F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1173006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
117400D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1175004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1176ENTRY
1177
1178 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
117900E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
118000C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1181ENTRY
1182
1183B<NOTE:> The code point in the UCA file format (before C<';'>)
1184B<must> be a Unicode code point (defined as hexadecimal),
1185but not a native code point.
1186So C<0063> must always denote C<U+0063>,
1187but not a character of C<"\x63">.
1188
1189Weighting may vary depending on collation element table.
1190So ensure the weights defined in C<entry> will be consistent with
1191those in the collation element table loaded via C<table>.
1192
1193In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1194and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1195(as a value between C<0E60> and C<0E6D>)
1196makes ordering as C<C E<lt> CH E<lt> D>.
1197Exactly speaking DUCET already has some characters between C<C> and C<D>:
1198C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1199C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1200and C<c-curl> (C<U+0255>) with C<0E69>.
1201Then primary weight C<0E6A> for C<CH> makes C<CH>
1202ordered between C<c-curl> and C<D>.
1203
1204=item hangul_terminator
1205
1206-- see 7.1.4 Trailing Weights, UTS #10.
1207
1208If a true value is given (non-zero but should be positive),
1209it will be added as a terminator primary weight to the end of
1210every standard Hangul syllable. Secondary and any higher weights
1211for terminator are set to zero.
1212If the value is false or C<hangul_terminator> key does not exist,
1213insertion of terminator weights will not be performed.
1214
1215Boundaries of Hangul syllables are determined
1216according to conjoining Jamo behavior in F<the Unicode Standard>
1217and F<HangulSyllableType.txt>.
1218
1219B<Implementation Note:>
1220(1) For expansion mapping (Unicode character mapped
1221to a sequence of collation elements), a terminator will not be added
1222between collation elements, even if Hangul syllable boundary exists there.
1223Addition of terminator is restricted to the next position
1224to the last collation element.
1225
1226(2) Non-conjoining Hangul letters
1227(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1228automatically terminated with a terminator primary weight.
1229These characters may need terminator included in a collation element
1230table beforehand.
1231
1232=item highestFFFF
1233
1234-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
1235
1236If the parameter is made true, C<U+FFFF> has a highest primary weight.
1237When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1238C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1239begins with C<"abc">, or another primary equivalent.
1240C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1241such as C<"abc\x{FFFF}xyz">.
1242
1243C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1244almost, but the latter has a problem that you should know which letter is
1245next to C<c>. For a certain language where C<ch> is the next letter,
1246C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1247
1248Note:
1249This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1250Any other character than C<U+FFFF> can be tailored by C<entry>.
1251
1252=item identical
1253
1254-- see A.3 Deterministic Comparison, UTS #10.
1255
1256By default, strings whose weights are equal should be equal,
1257even though their code points are not equal.
1258Completely ignorable characters are ignored.
1259
1260If the parameter is made true, a final, tie-breaking level is used.
1261If no difference of weights is found after the comparison through
1262all the levels specified by C<level>, the comparison with code points
1263will be performed.
1264For the tie-breaking comparison, the sort key has code points
1265of the original string appended.
1266Completely ignorable characters are not ignored.
1267
1268If C<preprocess> and/or C<normalization> is applied, the code points
1269of the string after them (in NFD by default) are used.
1270
1271=item ignoreChar
1272
1273=item ignoreName
1274
1275-- see 3.6 Variable Weighting, UTS #10.
1276
1277Makes the entry in the table completely ignorable;
1278i.e. as if the weights were zero at all levels.
1279
1280Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1281will be ignored. Through C<ignoreName>, any character whose name
1282(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1283will be ignored.
1284
1285E.g. when 'a' and 'e' are ignorable,
1286'element' is equal to 'lament' (or 'lmnt').
1287
1288=item ignore_level2
1289
1290-- see 5.1 Parametric Tailoring, UTS #10.
1291
1292By default, case-sensitive comparison (that is level 3 difference)
1293won't ignore accents (that is level 2 difference).
1294
1295If the parameter is made true, accents (and other primary ignorable
1296characters) are ignored, even though cases are taken into account.
1297
1298B<NOTE>: C<level> should be 3 or greater.
1299
1300=item katakana_before_hiragana
1301
1302-- see 7.2 Tertiary Weight Table, UTS #10.
1303
1304By default, hiragana is before katakana.
1305If the parameter is made true, this is reversed.
1306
1307B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1308distinctions must occur in level 3, and their weights at level 3 must be
1309same as those mentioned in 7.3.1, UTS #10.
1310If you define your collation elements which violate this requirement,
1311this parameter does not work validly.
1312
1313=item level
1314
1315-- see 4.3 Form Sort Key, UTS #10.
1316
1317Set the maximum level.
1318Any higher levels than the specified one are ignored.
1319
1320 Level 1: alphabetic ordering
1321 Level 2: diacritic ordering
1322 Level 3: case ordering
1323 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1324
1325 ex.level => 2,
1326
1327If omitted, the maximum is the 4th.
1328
1329B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1330But this module only uses weights within 0xFFFF.
1331When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1332and 'shift-trimmed'), the level 4 may be unreliable.
1333
1334See also C<identical>.
1335
1336=item long_contraction
1337
1338-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1339
1340If the parameter is made true, for a contraction with three or more
1341characters (here nicknamed "long contraction"), initial substrings
1342will be handled.
1343For example, a contraction ABC, where A is a starter, and B and C
1344are non-starters (character with non-zero combining character class),
1345will be detected even if there is not AB as a contraction.
1346
1347B<Default:> Usually false.
1348If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1349is not specified in C<new()>, a true value is set implicitly.
1350This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1351
1352C<change()> handles C<long_contraction> explicitly only.
1353If C<long_contraction> is not specified in C<change()>, even though
1354C<UCA_Version> is changed, C<long_contraction> will not be changed.
1355
1356B<Limitation:> Scanning non-starters is one-way (no back tracking).
1357If AB is found but ABC is not found, another long contraction where
1358the first character is A and the second is not B may not be found.
1359
1360Under C<(normalization =E<gt> undef)>, detection step of discontiguous
1361contractions will be skipped.
1362
1363B<Note:> The following contractions in DUCET are not considered
1364in steps S2.1.1 to S2.1.3, where they are discontiguous.
1365
1366 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1367 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1368
1369For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1370(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1371In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1372instead of C<0FB2 0F71 0F80>.
1373Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1374contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1375
1376=item minimalFFFE
1377
1378-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
1379
1380If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1381The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1382first compares C<$a1> and C<$b1> at level 1, and
1383then C<$a2> and C<$b2> at level 1, as follows.
1384
1385 "ab\x{FFFE}a"
1386 "Ab\x{FFFE}a"
1387 "ab\x{FFFE}c"
1388 "Ab\x{FFFE}c"
1389 "ab\x{FFFE}xyz"
1390 "abc\x{FFFE}def"
1391 "abc\x{FFFE}xYz"
1392 "aBc\x{FFFE}xyz"
1393 "abcX\x{FFFE}def"
1394 "abcx\x{FFFE}xyz"
1395 "b\x{FFFE}aaa"
1396 "bbb\x{FFFE}a"
1397
1398Note:
1399This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1400Any other character than C<U+FFFE> can be tailored by C<entry>.
1401
1402=item normalization
1403
1404-- see 4.1 Normalize, UTS #10.
1405
1406If specified, strings are normalized before preparation of sort keys
1407(the normalization is executed after preprocess).
1408
1409A form name C<Unicode::Normalize::normalize()> accepts will be applied
1410as C<$normalization_form>.
1411Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1412See C<Unicode::Normalize::normalize()> for detail.
1413If omitted, C<'NFD'> is used.
1414
1415C<normalization> is performed after C<preprocess> (if defined).
1416
1417Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1418though they are not concerned with C<Unicode::Normalize::normalize()>.
1419
1420If C<undef> (not a string C<"undef">) is passed explicitly
1421as the value for this key,
1422no normalization is carried out (this may make tailoring easier
1423if normalization is not desired). Under C<(normalization =E<gt> undef)>,
1424only contiguous contractions are resolved;
1425e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1426C<A-cedilla-ring> would be primary equal to C<A>.
1427In this point,
1428C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1429B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1430
1431In the case of C<(normalization =E<gt> "prenormalized")>,
1432no normalization is performed, but
1433discontiguous contractions with combining characters are performed.
1434Therefore
1435C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1436B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1437If source strings are finely prenormalized,
1438C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1439
1440Except C<(normalization =E<gt> undef)>,
1441B<Unicode::Normalize> is required (see also B<CAVEAT>).
1442
1443=item overrideCJK
1444
1445-- see 7.1 Derived Collation Elements, UTS #10.
1446
1447By default, CJK unified ideographs are ordered in Unicode codepoint
1448order, but those in the CJK Unified Ideographs block are less than
1449those in the CJK Unified Ideographs Extension A etc.
1450
1451 In the CJK Unified Ideographs block:
1452 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1453 U+4E00..U+9FBB if UCA_Version is 14 or 16.
1454 U+4E00..U+9FC3 if UCA_Version is 18.
1455 U+4E00..U+9FCB if UCA_Version is 20 or 22.
1456 U+4E00..U+9FCC if UCA_Version is 24 to 30.
1457 U+4E00..U+9FD5 if UCA_Version is 32.
1458
1459 In the CJK Unified Ideographs Extension blocks:
1460 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1461 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1462 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1463 Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32.
1464
1465Through C<overrideCJK>, ordering of CJK unified ideographs (including
1466extensions) can be overridden.
1467
1468ex. CJK unified ideographs in the JIS code point order.
1469
1470 overrideCJK => sub {
1471 my $u = shift; # get a Unicode codepoint
1472 my $b = pack('n', $u); # to UTF-16BE
1473 my $s = your_unicode_to_sjis_converter($b); # convert
1474 my $n = unpack('n', $s); # convert sjis to short
1475 [ $n, 0x20, 0x2, $u ]; # return the collation element
1476 },
1477
1478The return value may be an arrayref of 1st to 4th weights as shown
1479above. The return value may be an integer as the primary weight
1480as shown below. If C<undef> is returned, the default derived
1481collation element will be used.
1482
1483 overrideCJK => sub {
1484 my $u = shift; # get a Unicode codepoint
1485 my $b = pack('n', $u); # to UTF-16BE
1486 my $s = your_unicode_to_sjis_converter($b); # convert
1487 my $n = unpack('n', $s); # convert sjis to short
1488 return $n; # return the primary weight
1489 },
1490
1491The return value may be a list containing zero or more of
1492an arrayref, an integer, or C<undef>.
1493
1494ex. ignores all CJK unified ideographs.
1495
1496 overrideCJK => sub {()}, # CODEREF returning empty list
1497
1498 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1499 # as U+4E00 is a CJK unified ideograph and to be ignorable.
1500
1501If a false value (including C<undef>) is passed, C<overrideCJK>
1502has no effect.
1503C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1504
1505But assignment of weight for CJK unified ideographs
1506in C<table> or C<entry> is still valid.
1507If C<undef> is passed explicitly as the value for this key,
1508weights for CJK unified ideographs are treated as undefined.
1509However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1510has no special meaning.
1511
1512B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1513C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1514C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1515ideographs. But they can't be overridden via C<overrideCJK> when you use
1516DUCET, as the table includes weights for them. C<table> or C<entry> has
1517priority over C<overrideCJK>.
1518
1519=item overrideHangul
1520
1521-- see 7.1 Derived Collation Elements, UTS #10.
1522
1523By default, Hangul syllables are decomposed into Hangul Jamo,
1524even if C<(normalization =E<gt> undef)>.
1525But the mapping of Hangul syllables may be overridden.
1526
1527This parameter works like C<overrideCJK>, so see there for examples.
1528
1529If you want to override the mapping of Hangul syllables,
1530NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1531Hangul syllables before overriding. FCD may decompose Hangul syllables
1532as the case may be.
1533
1534If a false value (but not C<undef>) is passed, C<overrideHangul>
1535has no effect.
1536C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1537
1538If C<undef> is passed explicitly as the value for this key,
1539weight for Hangul syllables is treated as undefined
1540without decomposition into Hangul Jamo.
1541But definition of weight for Hangul syllables
1542in C<table> or C<entry> is still valid.
1543
1544=item overrideOut
1545
1546-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1547
1548Perl seems to allow out-of-range values (greater than 0x10FFFF).
1549By default, out-of-range values are replaced with C<U+FFFD>
1550(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1551or ignored when C<UCA_Version> E<lt>= 20.
1552
1553When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1554can be overridden. Though C<table> or C<entry> are available for them,
1555out-of-range values are too many.
1556
1557C<overrideOut> can perform it algorithmically.
1558This parameter works like C<overrideCJK>, so see there for examples.
1559
1560ex. ignores all out-of-range values.
1561
1562 overrideOut => sub {()}, # CODEREF returning empty list
1563
1564If a false value (including C<undef>) is passed, C<overrideOut>
1565has no effect.
1566C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1567
1568B<NOTE ABOUT U+FFFD:>
1569
1570UCA recommends that out-of-range values should not be ignored for security
1571reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1572However, C<U+FFFD> is wrongly mapped to a variable collation element
1573in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1574ignored when C<variable> isn't C<Non-ignorable>.
1575
1576The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1577See L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1578(7.1.4 Trailing Weights). Such a correction is reproduced by this.
1579
1580 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1581
1582This workaround is unnecessary since Unicode 6.3.0.
1583
1584=item preprocess
1585
1586-- see 5.4 Preprocessing, UTS #10.
1587
1588If specified, the coderef is used to preprocess each string
1589before the formation of sort keys.
1590
1591ex. dropping English articles, such as "a" or "the".
1592Then, "the pen" is before "a pencil".
1593
1594 preprocess => sub {
1595 my $str = shift;
1596 $str =~ s/\b(?:an?|the)\s+//gi;
1597 return $str;
1598 },
1599
1600C<preprocess> is performed before C<normalization> (if defined).
1601
1602ex. decoding strings in a legacy encoding such as shift-jis:
1603
1604 $sjis_collator = Unicode::Collate->new(
1605 preprocess => \&your_shiftjis_to_unicode_decoder,
1606 );
1607 @result = $sjis_collator->sort(@shiftjis_strings);
1608
1609B<Note:> Strings returned from the coderef will be interpreted
1610according to Perl's Unicode support. See L<perlunicode>,
1611L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1612
1613=item rearrange
1614
1615-- see 3.5 Rearrangement, UTS #10.
1616
1617Characters that are not coded in logical order and to be rearranged.
1618If C<UCA_Version> is equal to or less than 11, default is:
1619
1620 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1621
1622If you want to disallow any rearrangement, pass C<undef> or C<[]>
1623(a reference to empty list) as the value for this key.
1624
1625If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1626(i.e. no rearrangement).
1627
1628B<According to the version 9 of UCA, this parameter shall not be used;
1629but it is not warned at present.>
1630
1631=item rewrite
1632
1633If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1634The coderef will get each line, and then should return a rewritten line
1635according to the UCA file format.
1636If the coderef returns an empty line, the line will be skipped.
1637
1638e.g. any primary ignorable characters into tertiary ignorable:
1639
1640 rewrite => sub {
1641 my $line = shift;
1642 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1643 return $line;
1644 },
1645
1646This example shows rewriting weights. C<rewrite> is allowed to
1647affect code points, weights, and the name.
1648
1649B<NOTE>: C<table> is available to use another table file;
1650preparing a modified table once would be more efficient than
1651rewriting lines on reading an unmodified table every time.
1652
1653=item suppress
1654
1655-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
1656
1657Contractions beginning with the specified characters are suppressed,
1658even if those contractions are defined in C<table>.
1659
1660An example for Russian and some languages using the Cyrillic script:
1661
1662 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1663
1664where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1665
1666B<NOTE>: Contractions via C<entry> will not be suppressed.
1667
1668=item table
1669
1670-- see 3.8 Default Unicode Collation Element Table, UTS #10.
1671
1672You can use another collation element table if desired.
1673
1674The table file should be located in the F<Unicode/Collate> directory
1675on C<@INC>. Say, if the filename is F<Foo.txt>,
1676the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1677
1678By default, F<allkeys.txt> (as the filename of DUCET) is used.
1679If you will prepare your own table file, any name other than F<allkeys.txt>
1680may be better to avoid namespace conflict.
1681
1682B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1683module, and it may save time at the run time.
1684Explicitly saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1685or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1686C<rewrite> will prevent this module from using the compiled DUCET.
1687
1688If C<undef> is passed explicitly as the value for this key,
1689no file is read (but you can define collation elements via C<entry>).
1690
1691A typical way to define a collation element table
1692without any file of table:
1693
1694 $onlyABC = Unicode::Collate->new(
1695 table => undef,
1696 entry => << 'ENTRIES',
16970061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
16980041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
16990062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
17000042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
17010063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
17020043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1703ENTRIES
1704 );
1705
1706If C<ignoreName> or C<undefName> is used, character names should be
1707specified as a comment (following C<#>) on each line.
1708
1709=item undefChar
1710
1711=item undefName
1712
1713-- see 6.3.3 Reducing the Repertoire, UTS #10.
1714
1715Undefines the collation element as if it were unassigned in the C<table>.
1716This reduces the size of the table.
1717If an unassigned character appears in the string to be collated,
1718the sort key is made from its codepoint
1719as a single-character collation element,
1720as it is greater than any other assigned collation elements
1721(in the codepoint order among the unassigned characters).
1722But, it'd be better to ignore characters
1723unfamiliar to you and maybe never used.
1724
1725Through C<undefChar>, any character matching C<qr/$undefChar/>
1726will be undefined. Through C<undefName>, any character whose name
1727(given in the C<table> file as a comment) matches C<qr/$undefName/>
1728will be undefined.
1729
1730ex. Collation weights for beyond-BMP characters are not stored in object:
1731
1732 undefChar => qr/[^\0-\x{fffd}]/,
1733
1734=item upper_before_lower
1735
1736-- see 6.6 Case Comparisons, UTS #10.
1737
1738By default, lowercase is before uppercase.
1739If the parameter is made true, this is reversed.
1740
1741B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1742distinctions must occur in level 3, and their weights at level 3 must be
1743same as those mentioned in 7.3.1, UTS #10.
1744If you define your collation elements which differ from this requirement,
1745this parameter doesn't work validly.
1746
1747=item variable
1748
1749-- see 3.6 Variable Weighting, UTS #10.
1750
1751This key allows for variable weighting of variable collation elements,
1752which are marked with an ASTERISK in the table
1753(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1754
1755 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1756
1757These names are case-insensitive.
1758By default (if specification is omitted), 'shifted' is adopted.
1759
1760 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1761 considered at the 4th level.
1762
1763 'Non-Ignorable' Variable elements are not reset to ignorable.
1764
1765 'Shifted' Variable elements are made ignorable at levels 1 through 3;
1766 their level 4 weight is replaced by the old level 1 weight.
1767 Level 4 weight for Non-Variable elements is 0xFFFF.
1768
1769 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1770 are trimmed.
1771
1772=back
1773
1774=head2 Methods for Collation
1775
1776=over 4
1777
1778=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1779
1780Sorts a list of strings.
1781
1782=item C<$result = $Collator-E<gt>cmp($a, $b)>
1783
1784Returns 1 (when C<$a> is greater than C<$b>)
1785or 0 (when C<$a> is equal to C<$b>)
1786or -1 (when C<$a> is less than C<$b>).
1787
1788=item C<$result = $Collator-E<gt>eq($a, $b)>
1789
1790=item C<$result = $Collator-E<gt>ne($a, $b)>
1791
1792=item C<$result = $Collator-E<gt>lt($a, $b)>
1793
1794=item C<$result = $Collator-E<gt>le($a, $b)>
1795
1796=item C<$result = $Collator-E<gt>gt($a, $b)>
1797
1798=item C<$result = $Collator-E<gt>ge($a, $b)>
1799
1800They work like the operators of the same names.
1801
1802 eq : whether $a is equal to $b.
1803 ne : whether $a is not equal to $b.
1804 lt : whether $a is less than $b.
1805 le : whether $a is less than $b or equal to $b.
1806 gt : whether $a is greater than $b.
1807 ge : whether $a is greater than $b or equal to $b.
1808
1809=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1810
1811-- see 4.3 Form Sort Key, UTS #10.
1812
1813Returns a sort key.
1814
1815You compare the sort keys using a binary comparison
1816and get the result of the comparison of the strings using UCA.
1817
1818 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1819
1820 is equivalent to
1821
1822 $Collator->cmp($a, $b)
1823
1824=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1825
1826Converts a sorting key into its representation form.
1827If C<UCA_Version> is 8, the output is slightly different.
1828
1829 use Unicode::Collate;
1830 my $c = Unicode::Collate->new();
1831 print $c->viewSortKey("Perl"),"\n";
1832
1833 # output:
1834 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1835 # Level 1 Level 2 Level 3 Level 4
1836
1837=back
1838
1839=head2 Methods for Searching
1840
1841The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1842like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1843but they are not aware of any pattern; they match only a literal substring.
1844
1845B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1846for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1847C<subst>, C<gsubst>) will croak, as the position and the length might
1848differ from those on the specified string.
1849
1850C<rearrange> and C<hangul_terminator> parameters are neglected.
1851C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1852matching and searching, as it doesn't matter whether greater or less.
1853
1854=over 4
1855
1856=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1857
1858=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1859
1860If C<$substring> matches a part of C<$string>, returns
1861the position of the first occurrence of the matching part in scalar context;
1862in list context, returns a two-element list of
1863the position and the length of the matching part.
1864
1865If C<$substring> does not match any part of C<$string>,
1866returns C<-1> in scalar context and
1867an empty list in list context.
1868
1869e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1870you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1871
1872 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1873 # (normalization => undef) is REQUIRED.
1874 my $match;
1875 if (my($pos,$len) = $Collator->index($str, $sub)) {
1876 $match = substr($str, $pos, $len);
1877 }
1878
1879and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1880is primary equal to C<"M>E<uuml>C<SS">.
1881
1882=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1883
1884=item C<($match) = $Collator-E<gt>match($string, $substring)>
1885
1886If C<$substring> matches a part of C<$string>, in scalar context, returns
1887B<a reference to> the first occurrence of the matching part
1888(C<$match_ref> is always true if matches,
1889since every reference is B<true>);
1890in list context, returns the first occurrence of the matching part.
1891
1892If C<$substring> does not match any part of C<$string>,
1893returns C<undef> in scalar context and
1894an empty list in list context.
1895
1896e.g.
1897
1898 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1899 print "matches [$$match_ref].\n";
1900 } else {
1901 print "doesn't match.\n";
1902 }
1903
1904 or
1905
1906 if (($match) = $Collator->match($str, $sub)) { # list context
1907 print "matches [$match].\n";
1908 } else {
1909 print "doesn't match.\n";
1910 }
1911
1912=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1913
1914If C<$substring> matches a part of C<$string>, returns
1915all the matching parts (or matching count in scalar context).
1916
1917If C<$substring> does not match any part of C<$string>,
1918returns an empty list.
1919
1920=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1921
1922If C<$substring> matches a part of C<$string>,
1923the first occurrence of the matching part is replaced by C<$replacement>
1924(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1925
1926C<$replacement> can be a C<CODEREF>,
1927taking the matching part as an argument,
1928and returning a string to replace the matching part
1929(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1930
1931=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1932
1933If C<$substring> matches a part of C<$string>,
1934all the occurrences of the matching part are replaced by C<$replacement>
1935(C<$string> is modified) and C<$count> is returned.
1936
1937C<$replacement> can be a C<CODEREF>,
1938taking the matching part as an argument,
1939and returning a string to replace the matching part
1940(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1941
1942e.g.
1943
1944 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1945 # (normalization => undef) is REQUIRED.
1946 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1947 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1948
1949 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1950 # i.e., all the camels are made bold-faced.
1951
1952 Examples: levels and ignore_level2 - what does camel match?
1953 ---------------------------------------------------------------------------
1954 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1955 -----------------------|---------------------------------------------------
1956 1 false | yes yes yes yes yes
1957 2 false | yes yes no yes yes
1958 3 false | yes no no yes yes
1959 4 false | yes no no no yes
1960 -----------------------|---------------------------------------------------
1961 1 true | yes yes yes yes yes
1962 2 true | yes yes yes yes yes
1963 3 true | yes no yes yes yes
1964 4 true | yes no yes no yes
1965 ---------------------------------------------------------------------------
1966 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1967 at any level.
1968
1969=back
1970
1971=head2 Other Methods
1972
1973=over 4
1974
1975=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1976
1977=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1978
1979Changes the value of specified keys and returns the changed part.
1980
1981 $Collator = Unicode::Collate->new(level => 4);
1982
1983 $Collator->eq("perl", "PERL"); # false
1984
1985 %old = $Collator->change(level => 2); # returns (level => 4).
1986
1987 $Collator->eq("perl", "PERL"); # true
1988
1989 $Collator->change(%old); # returns (level => 2).
1990
1991 $Collator->eq("perl", "PERL"); # false
1992
1993Not all C<(key,value)>s are allowed to be changed.
1994See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1995
1996In scalar context, returns the modified collator itself
1997(it is B<not> a clone of the original; the original object is changed).
1998
1999 $Collator->change(level => 2)->eq("perl", "PERL"); # true
2000
2001 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2002
2003 $Collator->change(level => 4)->eq("perl", "PERL"); # false
2004
2005=item C<$version = $Collator-E<gt>version()>
2006
2007Returns the version number (a string) of the Unicode Standard
2008which the C<table> file used by the collator object is based on.
2009If the table does not include a version line (starting with C<@version>),
2010returns C<"unknown">.
2011
2012=item C<UCA_Version()>
2013
2014Returns the revision number of UTS #10 that this module consults,
2015which should correspond to the incorporated DUCET.
2016
2017=item C<Base_Unicode_Version()>
2018
2019Returns the version number of UTS #10 that this module consults,
2020which should correspond to the incorporated DUCET.
2021
2022=back
2023
2024=head1 EXPORT
2025
2026No method will be exported.
2027
2028=head1 INSTALL
2029
2030Though this module can be used without any C<table> file,
2031to use this module easily, it is recommended to install a table file
2032in the UCA format, by copying it under the directory
2033<a place in @INC>/Unicode/Collate.
2034
2035The preferred one is "The Default Unicode Collation Element Table"
2036(aka DUCET), available from the Unicode Consortium's website:
2037
2038 http://www.unicode.org/Public/UCA/
2039
2040 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
2041
2042If DUCET is not installed, it is recommended to copy the file
2043from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2044to <a place in @INC>/Unicode/Collate/allkeys.txt
2045manually.
2046
2047=head1 CAVEATS
2048
2049=over 4
2050
2051=item Normalization
2052
2053Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2054module (see L<Unicode::Normalize>).
2055
2056If you do not need it (say, when you do not need to
2057handle any combining characters),
2058assign C<(normalization =E<gt> undef)> explicitly.
2059
2060-- see 6.5 Avoiding Normalization, UTS #10.
2061
2062=item Conformance Test
2063
2064The Conformance Test for the UCA is available
2065under L<http://www.unicode.org/Public/UCA/>.
2066
2067For F<CollationTest_SHIFTED.txt>,
2068a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2069for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2070C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2071
2072If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2073C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2074C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2075C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2076
2077B<Unicode::Normalize is required to try The Conformance Test.>
2078
2079=back
2080
2081=head1 AUTHOR, COPYRIGHT AND LICENSE
2082
2083The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2084<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2016,
2085SADAHIRO Tomoyuki. Japan. All rights reserved.
2086
2087This module is free software; you can redistribute it and/or
2088modify it under the same terms as Perl itself.
2089
2090The file Unicode/Collate/allkeys.txt was copied verbatim
2091from L<http://www.unicode.org/Public/UCA/8.0.0/allkeys.txt>.
2092For this file, Copyright (c) 2001-2015 Unicode, Inc.; distributed
2093under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>.
2094
2095=head1 SEE ALSO
2096
2097=over 4
2098
2099=item Unicode Collation Algorithm - UTS #10
2100
2101L<http://www.unicode.org/reports/tr10/>
2102
2103=item The Default Unicode Collation Element Table (DUCET)
2104
2105L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2106
2107=item The conformance test for the UCA
2108
2109L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2110
2111L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2112
2113=item Hangul Syllable Type
2114
2115L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2116
2117=item Unicode Normalization Forms - UAX #15
2118
2119L<http://www.unicode.org/reports/tr15/>
2120
2121=item Unicode Locale Data Markup Language (LDML) - UTS #35
2122
2123L<http://www.unicode.org/reports/tr35/>
2124
2125=back
2126
2127=cut