1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.92';
18 our $PACKAGE = __PACKAGE__;
22 our @ISA = qw(DynaLoader);
23 bootstrap Unicode::Collate $VERSION;
26 my @Path = qw(Unicode Collate);
27 my $KeyFile = "allkeys.txt";
30 use constant TRUE => 1;
31 use constant FALSE => "";
32 use constant NOMATCHPOS => -1;
34 # A coderef to get combining class imported from Unicode::Normalize
35 # (i.e. \&Unicode::Normalize::getCombinClass).
36 # This is also used as a HAS_UNICODE_NORMALIZE flag.
40 use constant MinLevel => 1;
41 use constant MaxLevel => 4;
43 # Minimum weights at level 2 and 3, respectively
44 use constant Min2Wt => 0x20;
45 use constant Min3Wt => 0x02;
47 # Shifted weight at 4th level
48 use constant Shift4Wt => 0xFFFF;
50 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
51 use constant VCE_TEMPLATE => 'Cn4';
53 # A sort key: 16-bit weights
54 use constant KEY_TEMPLATE => 'n*';
56 # The tie-breaking: 32-bit weights
57 use constant TIE_TEMPLATE => 'N*';
59 # Level separator in a sort key:
60 # i.e. pack(KEY_TEMPLATE, 0)
61 use constant LEVEL_SEP => "\0\0";
63 # As Unicode code point separator for hash keys.
64 # A joined code point string (denoted by JCPS below)
65 # like "65;768" is used for internal processing
66 # instead of Perl's Unicode string like "\x41\x{300}",
67 # as the native code point is different from the Unicode code point
69 # This character must not be included in any stringified
70 # representation of an integer.
71 use constant CODE_SEP => ';';
72 # NOTE: in regex /;/ is used for $jcps!
74 # boolean values of variable weights
75 use constant NON_VAR => 0; # Non-Variable character
76 use constant VAR => 1; # Variable character
78 # specific code points
79 use constant Hangul_SIni => 0xAC00;
80 use constant Hangul_SFin => 0xD7A3;
82 # Logical_Order_Exception in PropList.txt
83 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
85 sub UCA_Version { "24" }
87 sub Base_Unicode_Version { "6.1.0" }
92 return pack('U*', @_);
99 blanked non-ignorable shifted shift-trimmed
100 / } = (); # keys lowercased
103 alternate backwards level normalization rearrange
104 katakana_before_hiragana upper_before_lower ignore_level2
105 overrideHangul overrideCJK preprocess UCA_Version
106 hangul_terminator variable identical
110 entry mapping table maxlength contraction
111 ignoreChar ignoreName undefChar undefName rewrite
112 versionTable alternateTable backwardsTable forwardsTable
113 rearrangeTable variableTable
114 derivCode normCode rearrangeHash backwardsFlag
115 suppress suppressHash
116 __useXS /; ### XS only
117 # The hash key 'ignored' is deleted at v 0.21.
118 # The hash key 'isShift' is deleted at v 0.23.
119 # The hash key 'combining' is deleted at v 0.24.
120 # The hash key 'entries' is deleted at v 0.30.
121 # The hash key 'L3_ignorable' is deleted at v 0.40.
125 return $self->{versionTable} || 'unknown';
128 my (%ChangeOK, %ChangeNG);
129 @ChangeOK{ @ChangeOK } = ();
130 @ChangeNG{ @ChangeNG } = ();
136 if (exists $hash{alternate}) {
137 if (exists $hash{variable}) {
138 delete $hash{alternate};
140 $hash{variable} = $hash{alternate};
143 foreach my $k (keys %hash) {
144 if (exists $ChangeOK{$k}) {
145 $old{$k} = $self->{$k};
146 $self->{$k} = $hash{$k};
147 } elsif (exists $ChangeNG{$k}) {
148 croak "change of $k via change() is not allowed!";
152 $self->checkCollator();
153 return wantarray ? %old : $self;
158 my $key = shift; # 'level' or 'backwards'
159 MinLevel <= $level or croak sprintf
160 "Illegal level %d (in value for key '%s') lower than %d.",
161 $level, $key, MinLevel;
162 $level <= MaxLevel or croak sprintf
163 "Unsupported level %d (in value for key '%s') higher than %d.",
164 $level, $key, MaxLevel;
170 11 => \&_derivCE_9, # 11 == 9
172 16 => \&_derivCE_14, # 16 == 14
177 26 => \&_derivCE_24, # 26 == 24
182 _checkLevel($self->{level}, "level");
184 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
185 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
187 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
188 $self->{alternateTable} || 'shifted';
189 $self->{variable} = $self->{alternate} = lc($self->{variable});
190 exists $VariableOK{ $self->{variable} }
191 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
193 if (! defined $self->{backwards}) {
194 $self->{backwardsFlag} = 0;
195 } elsif (! ref $self->{backwards}) {
196 _checkLevel($self->{backwards}, "backwards");
197 $self->{backwardsFlag} = 1 << $self->{backwards};
200 $self->{backwardsFlag} = 0;
201 for my $b (@{ $self->{backwards} }) {
202 _checkLevel($b, "backwards");
205 for my $v (sort keys %level) {
206 $self->{backwardsFlag} += 1 << $v;
210 defined $self->{rearrange} or $self->{rearrange} = [];
211 ref $self->{rearrange}
212 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
214 # keys of $self->{rearrangeHash} are $self->{rearrange}.
215 $self->{rearrangeHash} = undef;
217 if (@{ $self->{rearrange} }) {
218 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
221 $self->{normCode} = undef;
223 if (defined $self->{normalization}) {
224 eval { require Unicode::Normalize };
225 $@ and croak "Unicode::Normalize is required to normalize strings";
227 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
229 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
230 $self->{normCode} = \&Unicode::Normalize::NFD;
232 elsif ($self->{normalization} ne 'prenormalized') {
233 my $norm = $self->{normalization};
234 $self->{normCode} = sub {
235 Unicode::Normalize::normalize($norm, shift);
237 eval { $self->{normCode}->("") }; # try
238 $@ and croak "$PACKAGE unknown normalization form name: $norm";
247 my $self = bless { @_ }, $class;
249 ### begin XS only ###
250 if (! exists $self->{table} && !defined $self->{rewrite} &&
251 !defined $self->{undefName} && !defined $self->{ignoreName} &&
252 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
253 $self->{__useXS} = \&_fetch_simple;
255 $self->{__useXS} = undef;
259 # keys of $self->{suppressHash} are $self->{suppress}.
260 if ($self->{suppress} && @{ $self->{suppress} }) {
261 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
262 } # before read_table()
264 # If undef is passed explicitly, no file is read.
265 $self->{table} = $KeyFile if ! exists $self->{table};
266 $self->read_table() if defined $self->{table};
268 if ($self->{entry}) {
269 while ($self->{entry} =~ /([^\n]+)/g) {
270 $self->parseEntry($1, TRUE);
274 $self->{level} ||= MaxLevel;
275 $self->{UCA_Version} ||= UCA_Version();
277 $self->{overrideHangul} = FALSE
278 if ! exists $self->{overrideHangul};
279 $self->{overrideCJK} = FALSE
280 if ! exists $self->{overrideCJK};
281 $self->{normalization} = 'NFD'
282 if ! exists $self->{normalization};
283 $self->{rearrange} = $self->{rearrangeTable} ||
284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
285 if ! exists $self->{rearrange};
286 $self->{backwards} = $self->{backwardsTable}
287 if ! exists $self->{backwards};
289 $self->checkCollator();
296 my $line = shift; # after s/^\s*\@//
298 if ($line =~ /^version\s*(\S*)/) {
299 $self->{versionTable} ||= $1;
301 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
302 $self->{variableTable} ||= $1;
304 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
305 $self->{alternateTable} ||= $1;
307 elsif ($line =~ /^backwards\s+(\S*)/) {
308 push @{ $self->{backwardsTable} }, $1;
310 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
311 push @{ $self->{forwardsTable} }, $1;
313 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
314 push @{ $self->{rearrangeTable} }, _getHexArray($1);
321 ### begin XS only ###
322 if ($self->{__useXS}) {
323 my @rest = _fetch_rest(); # complex matter need to parse
324 for my $line (@rest) {
325 next if $line =~ /^\s*#/;
327 if ($line =~ s/^\s*\@//) {
328 $self->parseAtmark($line);
330 $self->parseEntry($line);
338 foreach my $d (@INC) {
339 $f = File::Spec->catfile($d, @Path, $self->{table});
340 last if open($fh, $f);
344 $f = File::Spec->catfile(@Path, $self->{table});
345 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
348 while (my $line = <$fh>) {
349 next if $line =~ /^\s*#/;
351 if ($line =~ s/^\s*\@//) {
352 $self->parseAtmark($line);
354 $self->parseEntry($line);
362 ## get $line, parse it, and write an entry in $self
368 my $tailoring = shift;
369 my($name, $entry, @uv, @key);
371 if (defined $self->{rewrite}) {
372 $line = $self->{rewrite}->($line);
375 return if $line !~ /^\s*[0-9A-Fa-f]/;
377 # removes comment and gets name
379 if $line =~ s/[#%]\s*(.*)//;
380 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
383 my($e, $k) = split /;/, $line;
384 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
387 @uv = _getHexArray($e);
389 return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
390 exists $self->{suppressHash}{$uv[0]};
391 $entry = join(CODE_SEP, @uv); # in JCPS
393 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
394 my $ele = pack_U(@uv);
396 # regarded as if it were not entried in the table
398 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
400 # replaced as completely ignorable
401 $k = '[.0000.0000.0000.0000]'
402 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
405 # replaced as completely ignorable
406 $k = '[.0000.0000.0000.0000]'
407 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
409 my $is_L3_ignorable = TRUE;
411 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
412 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
413 my @wt = _getHexArray($arr);
414 push @key, pack(VCE_TEMPLATE, $var, @wt);
415 $is_L3_ignorable = FALSE
416 if $wt[0] || $wt[1] || $wt[2];
417 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
418 # is completely ignorable.
419 # For expansion, an entry $is_L3_ignorable
420 # if and only if "all" CEs are [.0000.0000.0000].
423 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
426 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
427 $self->{maxlength}{$uv[0]} = @uv;
433 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
434 $self->{contraction}{$fake_entry} = 1;
444 $self->visualizeSortKey($self->getSortKey($str));
452 my $prep = $self->{preprocess};
453 my $norm = $self->{normCode};
455 $str = &$prep($str) if ref $prep;
456 $str = &$norm($str) if ref $norm;
461 ## arrayref of JCPS = splitEnt(string to be collated)
462 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
468 my $wLen = shift; # with Length
470 my $map = $self->{mapping};
471 my $max = $self->{maxlength};
472 my $reH = $self->{rearrangeHash};
473 my $vers = $self->{UCA_Version};
474 my $ver9 = $vers >= 9 && $vers <= 11;
475 my $uXS = $self->{__useXS}; ### XS only
479 # get array of Unicode code point of string.
480 my @src = unpack_U($str);
483 # Character positions are not kept if rearranged,
484 # then neglected if $wLen is true.
485 if ($reH && ! $wLen) {
486 for (my $i = 0; $i < @src; $i++) {
487 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
488 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
494 # remove a code point marked as a completely ignorable.
495 for (my $i = 0; $i < @src; $i++) {
496 if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
499 $src[$i] = undef if $map->{ $src[$i] } &&
500 @{ $map->{ $src[$i] } } == 0;
501 ### begin XS only ###
503 $src[$i] = undef if _ignorable_simple($src[$i]);
509 for (my $i = 0; $i < @src; $i++) {
512 # skip removed code point
513 if (! defined $jcps) {
515 $buf[-1][2] = $i + 1;
524 my $temp_jcps = $jcps;
526 my $maxLen = $max->{$jcps};
528 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
529 next if ! defined $src[$p];
530 $temp_jcps .= CODE_SEP . $src[$p];
532 if ($map->{$temp_jcps}) {
538 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
539 # This process requires Unicode::Normalize.
540 # If "normalization" is undef, here should be skipped *always*
541 # (in spite of bool value of $CVgetCombinClass),
542 # since canonical ordering cannot be expected.
543 # Blocked combining character should not be contracted.
545 # $self->{normCode} is false in the case of "prenormalized".
546 if ($self->{normalization}) {
547 my $cont = $self->{contraction};
553 for (my $p = $i + 1; $p < @src; $p++) {
554 next if ! defined $src[$p];
555 my $curCC = $CVgetCombinClass->($src[$p]);
557 my $tail = CODE_SEP . $src[$p];
559 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
560 $cont->{$jcps_uc.$tail})) {
567 if ($preCC != $curCC && $map->{$jcps.$tail}) {
575 if ($map->{$jcps_uc}) {
577 $src[$_] = undef for @out_uc;
579 $src[$_] = undef for @out;
584 # skip completely ignorable
585 if ($uXS && $jcps !~ /;/ && _ignorable_simple($jcps) || ### XS only
586 $map->{$jcps} && @{ $map->{$jcps} } == 0) {
588 $buf[-1][2] = $i + 1;
593 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
599 ## VCE = _pack_override(input, codepoint, derivCode)
601 sub _pack_override ($$$) {
607 return pack(VCE_TEMPLATE, NON_VAR, @$r);
608 } elsif (defined $r) {
609 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
616 ## list of VCE = getWt(JCPS)
622 my $map = $self->{mapping};
623 my $der = $self->{derivCode};
624 my $uXS = $self->{__useXS}; ### XS only
626 return if !defined $u;
627 return map($self->varCE($_), @{ $map->{$u} })
629 ### begin XS only ###
630 return map($self->varCE($_), _fetch_simple($u))
631 if $uXS && _exists_simple($u);
634 # JCPS must not be a contraction, then it's a code point.
635 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
636 my $hang = $self->{overrideHangul};
639 @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
640 } elsif (!defined $hang) {
641 @hangulCE = $der->($u);
643 my $max = $self->{maxlength};
644 my @decH = _decompHangul($u);
647 my $contract = join(CODE_SEP, @decH);
648 @decH = ($contract) if $map->{$contract};
649 } else { # must be <@decH == 3>
650 if ($max->{$decH[0]}) {
651 my $contract = join(CODE_SEP, @decH);
652 if ($map->{$contract}) {
655 $contract = join(CODE_SEP, @decH[0,1]);
656 $map->{$contract} and @decH = ($contract, $decH[2]);
658 # even if V's ignorable, LT contraction is not supported.
659 # If such a situation were required, NFD should be used.
661 if (@decH == 3 && $max->{$decH[1]}) {
662 my $contract = join(CODE_SEP, @decH[1,2]);
663 $map->{$contract} and @decH = ($decH[0], $contract);
668 $map->{$_} ? @{ $map->{$_} } :
669 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
673 return map $self->varCE($_), @hangulCE;
675 my $cjk = $self->{overrideCJK};
676 my $vers = $self->{UCA_Version};
677 if ($cjk && _isUIdeo($u, $vers)) {
678 my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
679 return map $self->varCE($_), @cjkCE;
681 if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
682 return map $self->varCE($_), _uideoCE_8($u);
684 return map $self->varCE($_), $der->($u);
690 ## string sortkey = getSortKey(string arg)
696 my $str = $self->process($orig);
697 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
698 my $vers = $self->{UCA_Version};
699 my $term = $self->{hangul_terminator};
700 my $lev = $self->{level};
701 my $iden = $self->{identical};
703 my @buf; # weight arrays
706 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
707 foreach my $jcps (@$rEnt) {
708 # weird things like VL, TL-contraction are not considered!
709 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
710 if ($preHST && !$curHST || # hangul before non-hangul
711 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
712 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
713 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
717 push @buf, $self->getWt($jcps);
719 push @buf, $termCE if $preHST; # end at hangul
721 foreach my $jcps (@$rEnt) {
722 push @buf, $self->getWt($jcps);
726 my $rkey = $self->mk_SortKey(\@buf); ### XS only
728 if ($iden || $vers >= 26 && $lev == MaxLevel) {
730 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
737 ## int compare = cmp(string a, string b)
739 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
740 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
741 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
742 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
743 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
744 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
745 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
748 ## list[strings] sorted = sort(list[strings] arg)
754 sort{ $a->[0] cmp $b->[0] }
755 map [ $obj->getSortKey($_), $_ ], @_;
760 ## bool _nonIgnorAtLevel(arrayref weights, int level)
762 sub _nonIgnorAtLevel($$)
765 return if ! defined $wt;
767 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
772 ## arrayref of arrayref[weights] source,
773 ## arrayref of arrayref[weights] substr,
775 ## * comparison of graphemes vs graphemes.
776 ## @$source >= @$substr must be true (check it before call this);
784 for my $g (0..@$substr-1){
785 # Do the $g'th graphemes have the same number of AV weights?
786 return if @{ $source->[$g] } != @{ $substr->[$g] };
788 for my $w (0..@{ $substr->[$g] }-1) {
789 for my $v (0..$lev-1) {
790 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
798 ## (int position, int length)
799 ## int position = index(string, substring, position, [undoc'ed global])
801 ## With "global" (only for the list context),
802 ## returns list of arrayref[position, length].
807 $self->{preprocess} and
808 croak "Don't use Preprocess with index(), match(), etc.";
809 $self->{normCode} and
810 croak "Don't use Normalization with index(), match(), etc.";
813 my $len = length($str);
815 my $subE = $self->splitEnt($sub);
816 my $pos = @_ ? shift : 0;
817 $pos = 0 if $pos < 0;
820 my $lev = $self->{level};
821 my $v2i = $self->{UCA_Version} >= 9 &&
822 $self->{variable} ne 'non-ignorable';
825 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
827 ? map([$_, 0], $temp..$len)
828 : wantarray ? ($temp,0) : $temp;
831 and return wantarray ? () : NOMATCHPOS;
832 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
834 or return wantarray ? () : NOMATCHPOS;
836 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
838 my $last_is_variable;
839 for my $vwt (map $self->getWt($_), @$subE) {
840 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
841 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
843 # "Ignorable (L1, L2) after Variable" since track. v. 9
846 $last_is_variable = TRUE;
848 elsif (!$wt[0]) { # ignorable
849 $to_be_pushed = FALSE if $last_is_variable;
852 $last_is_variable = FALSE;
856 if (@subWt && !$var && !$wt[0]) {
857 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
858 } elsif ($to_be_pushed) {
859 push @subWt, [ \@wt ];
865 my $end = @$strE - 1;
867 $last_is_variable = FALSE; # reuse
868 for (my $i = 0; $i <= $end; ) { # no $i++
872 while ($i <= $end && $found_base == 0) {
873 for my $vwt ($self->getWt($strE->[$i][0])) {
874 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
875 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
877 # "Ignorable (L1, L2) after Variable" since track. v. 9
880 $last_is_variable = TRUE;
882 elsif (!$wt[0]) { # ignorable
883 $to_be_pushed = FALSE if $last_is_variable;
886 $last_is_variable = FALSE;
890 if (@strWt && !$var && !$wt[0]) {
891 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
892 $finPos[-1] = $strE->[$i][2];
893 } elsif ($to_be_pushed) {
894 push @strWt, [ \@wt ];
895 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
896 $finPos[-1] = NOMATCHPOS if $found_base;
897 push @finPos, $strE->[$i][2];
906 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
907 if ($iniPos[0] != NOMATCHPOS &&
908 $finPos[$#subWt] != NOMATCHPOS &&
909 _eqArray(\@strWt, \@subWt, $lev)) {
910 my $temp = $iniPos[0] + $pos;
913 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
914 splice @strWt, 0, $#subWt;
915 splice @iniPos, 0, $#subWt;
916 splice @finPos, 0, $#subWt;
920 ? ($temp, $finPos[$#subWt] - $iniPos[0])
932 : wantarray ? () : NOMATCHPOS;
936 ## scalarref to matching part = match(string, substring)
941 if (my($pos,$len) = $self->index($_[0], $_[1])) {
942 my $temp = substr($_[0], $pos, $len);
943 return wantarray ? $temp : \$temp;
944 # An lvalue ref \substr should be avoided,
945 # since its value is affected by modification of its referent.
953 ## arrayref matching parts = gmatch(string, substring)
960 return map substr($str, $_->[0], $_->[1]),
961 $self->index($str, $sub, 0, 'g');
965 ## bool subst'ed = subst(string, substring, replace)
970 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
972 if (my($pos,$len) = $self->index($_[0], $_[1])) {
974 my $mat = substr($_[0], $pos, $len);
975 substr($_[0], $pos, $len, $code->($mat));
977 substr($_[0], $pos, $len, $_[2]);
987 ## int count = gsubst(string, substring, replace)
992 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
995 # Replacement is carried out from the end, then use reverse.
996 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
998 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
999 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1001 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1013 Unicode::Collate - Unicode Collation Algorithm
1017 use Unicode::Collate;
1020 $Collator = Unicode::Collate->new(%tailoring);
1023 @sorted = $Collator->sort(@not_sorted);
1026 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1028 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1029 according to Perl's Unicode support. See L<perlunicode>,
1030 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1031 Otherwise you can use C<preprocess> or should decode them before.
1035 This module is an implementation of Unicode Technical Standard #10
1036 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1038 =head2 Constructor and Tailoring
1040 The C<new> method returns a collator object. If new() is called
1041 with no parameters, the collator should do the default collation.
1043 $Collator = Unicode::Collate->new(
1044 UCA_Version => $UCA_Version,
1045 alternate => $alternate, # alias for 'variable'
1046 backwards => $levelNumber, # or \@levelNumbers
1048 hangul_terminator => $term_primary_weight,
1050 ignoreName => qr/$ignoreName/,
1051 ignoreChar => qr/$ignoreChar/,
1052 ignore_level2 => $bool,
1053 katakana_before_hiragana => $bool,
1054 level => $collationLevel,
1055 normalization => $normalization_form,
1056 overrideCJK => \&overrideCJK,
1057 overrideHangul => \&overrideHangul,
1058 preprocess => \&preprocess,
1059 rearrange => \@charList,
1060 rewrite => \&rewrite,
1061 suppress => \@charList,
1063 undefName => qr/$undefName/,
1064 undefChar => qr/$undefChar/,
1065 upper_before_lower => $bool,
1066 variable => $variable,
1073 If the revision (previously "tracking version") number of UCA is given,
1074 behavior of that revision is emulated on collating.
1075 If omitted, the return value of C<UCA_Version()> is used.
1077 The following revisions are supported. The default is 24.
1079 UCA Unicode Standard DUCET (@version)
1080 -------------------------------------------------------
1081 8 3.1 3.0.1 (3.0.1d9)
1082 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1083 11 4.0 4.0.0 (4.0.0)
1084 14 4.1.0 4.1.0 (4.1.0)
1085 16 5.0 5.0.0 (5.0.0)
1086 18 5.1.0 5.1.0 (5.1.0)
1087 20 5.2.0 5.2.0 (5.2.0)
1088 22 6.0.0 6.0.0 (6.0.0)
1089 24 6.1.0 6.1.0 (6.1.0)
1090 26 6.2.0 6.2.0 (6.2.0)
1092 * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1093 since C<UCA_Version> 22.
1095 * Fully ignorable characters were ignored, and would not interrupt
1096 contractions with C<UCA_Version> 9 and 11.
1098 * Treatment of ignorables after variables and some behaviors
1099 were changed at C<UCA_Version> 9.
1101 * Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1102 depend on C<UCA_Version>.
1104 * Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1105 C<hangul_terminator>.
1109 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1111 For backward compatibility, C<alternate> (old name) can be used
1112 as an alias for C<variable>.
1116 -- see 3.4 Backward Accents, UTS #10.
1118 backwards => $levelNumber or \@levelNumbers
1120 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1121 If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1122 forwards at all the levels.
1126 -- see 5 Tailoring; 3.6.1 File Format, UTS #10.
1128 If the same character (or a sequence of characters) exists
1129 in the collation element table through C<table>,
1130 mapping to collation elements is overridden.
1131 If it does not exist, the mapping is defined additionally.
1133 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1134 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1135 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1136 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1137 006C 006C ; [.0F4C.0020.0002.006C] # ll
1138 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1139 004C 004C ; [.0F4C.0020.0008.004C] # LL
1140 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1141 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1142 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1143 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1146 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1147 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1148 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1151 B<NOTE:> The code point in the UCA file format (before C<';'>)
1152 B<must> be a Unicode code point (defined as hexadecimal),
1153 but not a native code point.
1154 So C<0063> must always denote C<U+0063>,
1155 but not a character of C<"\x63">.
1157 Weighting may vary depending on collation element table.
1158 So ensure the weights defined in C<entry> will be consistent with
1159 those in the collation element table loaded via C<table>.
1161 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1162 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1163 (as a value between C<0E60> and C<0E6D>)
1164 makes ordering as C<C E<lt> CH E<lt> D>.
1165 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1166 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1167 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1168 and C<c-curl> (C<U+0255>) with C<0E69>.
1169 Then primary weight C<0E6A> for C<CH> makes C<CH>
1170 ordered between C<c-curl> and C<D>.
1172 =item hangul_terminator
1174 -- see 7.1.4 Trailing Weights, UTS #10.
1176 If a true value is given (non-zero but should be positive),
1177 it will be added as a terminator primary weight to the end of
1178 every standard Hangul syllable. Secondary and any higher weights
1179 for terminator are set to zero.
1180 If the value is false or C<hangul_terminator> key does not exist,
1181 insertion of terminator weights will not be performed.
1183 Boundaries of Hangul syllables are determined
1184 according to conjoining Jamo behavior in F<the Unicode Standard>
1185 and F<HangulSyllableType.txt>.
1187 B<Implementation Note:>
1188 (1) For expansion mapping (Unicode character mapped
1189 to a sequence of collation elements), a terminator will not be added
1190 between collation elements, even if Hangul syllable boundary exists there.
1191 Addition of terminator is restricted to the next position
1192 to the last collation element.
1194 (2) Non-conjoining Hangul letters
1195 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1196 automatically terminated with a terminator primary weight.
1197 These characters may need terminator included in a collation element
1202 -- see A.3 Deterministic Comparison, UTS #10.
1204 By default, strings whose weights are equal should be equal,
1205 even though their code points are not equal.
1207 If the parameter is made true, a final, tie-breaking level is used.
1208 If no difference of weights is found after the comparison through all
1209 the level (independent of the value of C<level>), the comparison with
1210 code points will be performed. For the tie-breaking comparision,
1211 the sort key has code points of the original string appended.
1213 If C<preprocess> and/or C<normalization> is applied, the code points
1214 of the string after them (in NFD by default) are used.
1220 -- see 3.6.2 Variable Weighting, UTS #10.
1222 Makes the entry in the table completely ignorable;
1223 i.e. as if the weights were zero at all level.
1225 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1226 will be ignored. Through C<ignoreName>, any character whose name
1227 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1230 E.g. when 'a' and 'e' are ignorable,
1231 'element' is equal to 'lament' (or 'lmnt').
1235 -- see 5.1 Parametric Tailoring, UTS #10.
1237 By default, case-sensitive comparison (that is level 3 difference)
1238 won't ignore accents (that is level 2 difference).
1240 If the parameter is made true, accents (and other primary ignorable
1241 characters) are ignored, even though cases are taken into account.
1243 B<NOTE>: C<level> should be 3 or greater.
1245 =item katakana_before_hiragana
1247 -- see 7.2 Tertiary Weight Table, UTS #10.
1249 By default, hiragana is before katakana.
1250 If the parameter is made true, this is reversed.
1252 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1253 distinctions must occur in level 3, and their weights at level 3 must be
1254 same as those mentioned in 7.3.1, UTS #10.
1255 If you define your collation elements which violate this requirement,
1256 this parameter does not work validly.
1260 -- see 4.3 Form Sort Key, UTS #10.
1262 Set the maximum level.
1263 Any higher levels than the specified one are ignored.
1265 Level 1: alphabetic ordering
1266 Level 2: diacritic ordering
1267 Level 3: case ordering
1268 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1272 If omitted, the maximum is the 4th.
1274 B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1275 But this module only uses weights within 0xFFFF.
1276 When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1277 and 'shift-trimmed'), the level 4 may be unreliable.
1279 See also C<identical>.
1283 -- see 4.1 Normalize, UTS #10.
1285 If specified, strings are normalized before preparation of sort keys
1286 (the normalization is executed after preprocess).
1288 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1289 as C<$normalization_form>.
1290 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1291 See C<Unicode::Normalize::normalize()> for detail.
1292 If omitted, C<'NFD'> is used.
1294 C<normalization> is performed after C<preprocess> (if defined).
1296 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1297 though they are not concerned with C<Unicode::Normalize::normalize()>.
1299 If C<undef> (not a string C<"undef">) is passed explicitly
1300 as the value for this key,
1301 any normalization is not carried out (this may make tailoring easier
1302 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1303 only contiguous contractions are resolved;
1304 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1305 C<A-cedilla-ring> would be primary equal to C<A>.
1307 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1308 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1310 In the case of C<(normalization =E<gt> "prenormalized")>,
1311 any normalization is not performed, but
1312 discontiguous contractions with combining characters are performed.
1314 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1315 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1316 If source strings are finely prenormalized,
1317 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1319 Except C<(normalization =E<gt> undef)>,
1320 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1324 -- see 7.1 Derived Collation Elements, UTS #10.
1326 By default, CJK unified ideographs are ordered in Unicode codepoint
1327 order, but those in the CJK Unified Ideographs block are lesser than
1328 those in the CJK Unified Ideographs Extension A etc.
1330 In the CJK Unified Ideographs block:
1331 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1332 U+4E00..U+9FBB if UCA_Version is 14 or 16.
1333 U+4E00..U+9FC3 if UCA_Version is 18.
1334 U+4E00..U+9FCB if UCA_Version is 20 or 22.
1335 U+4E00..U+9FCC if UCA_Version is 24 or 26.
1337 In the CJK Unified Ideographs Extension blocks:
1338 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1339 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater.
1340 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
1342 Through C<overrideCJK>, ordering of CJK unified ideographs (including
1343 extensions) can be overridden.
1345 ex. CJK unified ideographs in the JIS code point order.
1347 overrideCJK => sub {
1348 my $u = shift; # get a Unicode codepoint
1349 my $b = pack('n', $u); # to UTF-16BE
1350 my $s = your_unicode_to_sjis_converter($b); # convert
1351 my $n = unpack('n', $s); # convert sjis to short
1352 [ $n, 0x20, 0x2, $u ]; # return the collation element
1355 The return value may be an arrayref of 1st to 4th weights as shown
1356 above. The return value may be an integer as the primary weight
1357 as shown below. If C<undef> is returned, the default derived
1358 collation element will be used.
1360 overrideCJK => sub {
1361 my $u = shift; # get a Unicode codepoint
1362 my $b = pack('n', $u); # to UTF-16BE
1363 my $s = your_unicode_to_sjis_converter($b); # convert
1364 my $n = unpack('n', $s); # convert sjis to short
1365 return $n; # return the primary weight
1368 The return value may be a list containing zero or more of
1369 an arrayref, an integer, or C<undef>.
1371 ex. ignores all CJK unified ideographs.
1373 overrideCJK => sub {()}, # CODEREF returning empty list
1375 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1376 # as U+4E00 is a CJK unified ideograph and to be ignorable.
1378 If C<undef> is passed explicitly as the value for this key,
1379 weights for CJK unified ideographs are treated as undefined.
1380 But assignment of weight for CJK unified ideographs
1381 in C<table> or C<entry> is still valid.
1383 B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1384 C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1385 C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1386 ideographs. But they can't be overridden via C<overrideCJK> when you use
1387 DUCET, as the table includes weights for them. C<table> or C<entry> has
1388 priority over C<overrideCJK>.
1390 =item overrideHangul
1392 -- see 7.1 Derived Collation Elements, UTS #10.
1394 By default, Hangul syllables are decomposed into Hangul Jamo,
1395 even if C<(normalization =E<gt> undef)>.
1396 But the mapping of Hangul syllables may be overridden.
1398 This parameter works like C<overrideCJK>, so see there for examples.
1400 If you want to override the mapping of Hangul syllables,
1401 NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1402 Hangul syllables before overriding. FCD may decompose Hangul syllables
1405 If C<undef> is passed explicitly as the value for this key,
1406 weight for Hangul syllables is treated as undefined
1407 without decomposition into Hangul Jamo.
1408 But definition of weight for Hangul syllables
1409 in C<table> or C<entry> is still valid.
1413 -- see 5.4 Preprocessing, UTS #10.
1415 If specified, the coderef is used to preprocess each string
1416 before the formation of sort keys.
1418 ex. dropping English articles, such as "a" or "the".
1419 Then, "the pen" is before "a pencil".
1423 $str =~ s/\b(?:an?|the)\s+//gi;
1427 C<preprocess> is performed before C<normalization> (if defined).
1429 ex. decoding strings in a legacy encoding such as shift-jis:
1431 $sjis_collator = Unicode::Collate->new(
1432 preprocess => \&your_shiftjis_to_unicode_decoder,
1434 @result = $sjis_collator->sort(@shiftjis_strings);
1436 B<Note:> Strings returned from the coderef will be interpreted
1437 according to Perl's Unicode support. See L<perlunicode>,
1438 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1442 -- see 3.5 Rearrangement, UTS #10.
1444 Characters that are not coded in logical order and to be rearranged.
1445 If C<UCA_Version> is equal to or lesser than 11, default is:
1447 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1449 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1450 (a reference to empty list) as the value for this key.
1452 If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1453 (i.e. no rearrangement).
1455 B<According to the version 9 of UCA, this parameter shall not be used;
1456 but it is not warned at present.>
1460 If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1461 The coderef will get each line, and then should return a rewritten line
1462 according to the UCA file format.
1463 If the coderef returns an empty line, the line will be skipped.
1465 e.g. any primary ignorable characters into tertiary ignorable:
1469 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1473 This example shows rewriting weights. C<rewrite> is allowed to
1474 affect code points, weights, and the name.
1476 B<NOTE>: C<table> is available to use another table file;
1477 preparing a modified table once would be more efficient than
1478 rewriting lines on reading an unmodified table every time.
1482 -- see suppress contractions in 5.14.11 Special-Purpose Commands,
1485 Contractions beginning with the specified characters are suppressed,
1486 even if those contractions are defined in C<table>.
1488 An example for Russian and some languages using the Cyrillic script:
1490 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1492 where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1494 B<NOTE>: Contractions via C<entry> are not be suppressed.
1498 -- see 3.6 Default Unicode Collation Element Table, UTS #10.
1500 You can use another collation element table if desired.
1502 The table file should locate in the F<Unicode/Collate> directory
1503 on C<@INC>. Say, if the filename is F<Foo.txt>,
1504 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1506 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1507 If you will prepare your own table file, any name other than F<allkeys.txt>
1508 may be better to avoid namespace conflict.
1510 B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1511 module, and it may save time at the run time.
1512 Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1513 or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1514 C<rewrite> will prevent this module from using the compiled DUCET.
1516 If C<undef> is passed explicitly as the value for this key,
1517 no file is read (but you can define collation elements via C<entry>).
1519 A typical way to define a collation element table
1520 without any file of table:
1522 $onlyABC = Unicode::Collate->new(
1524 entry => << 'ENTRIES',
1525 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1526 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1527 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1528 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1529 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1530 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1534 If C<ignoreName> or C<undefName> is used, character names should be
1535 specified as a comment (following C<#>) on each line.
1541 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1543 Undefines the collation element as if it were unassigned in the C<table>.
1544 This reduces the size of the table.
1545 If an unassigned character appears in the string to be collated,
1546 the sort key is made from its codepoint
1547 as a single-character collation element,
1548 as it is greater than any other assigned collation elements
1549 (in the codepoint order among the unassigned characters).
1550 But, it'd be better to ignore characters
1551 unfamiliar to you and maybe never used.
1553 Through C<undefChar>, any character matching C<qr/$undefChar/>
1554 will be undefined. Through C<undefName>, any character whose name
1555 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1558 ex. Collation weights for beyond-BMP characters are not stored in object:
1560 undefChar => qr/[^\0-\x{fffd}]/,
1562 =item upper_before_lower
1564 -- see 6.6 Case Comparisons, UTS #10.
1566 By default, lowercase is before uppercase.
1567 If the parameter is made true, this is reversed.
1569 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1570 distinctions must occur in level 3, and their weights at level 3 must be
1571 same as those mentioned in 7.3.1, UTS #10.
1572 If you define your collation elements which differs from this requirement,
1573 this parameter doesn't work validly.
1577 -- see 3.6.2 Variable Weighting, UTS #10.
1579 This key allows for variable weighting of variable collation elements,
1580 which are marked with an ASTERISK in the table
1581 (NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1583 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1585 These names are case-insensitive.
1586 By default (if specification is omitted), 'shifted' is adopted.
1588 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1589 considered at the 4th level.
1591 'Non-Ignorable' Variable elements are not reset to ignorable.
1593 'Shifted' Variable elements are made ignorable at levels 1 through 3
1594 their level 4 weight is replaced by the old level 1 weight.
1595 Level 4 weight for Non-Variable elements is 0xFFFF.
1597 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1602 =head2 Methods for Collation
1606 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1608 Sorts a list of strings.
1610 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1612 Returns 1 (when C<$a> is greater than C<$b>)
1613 or 0 (when C<$a> is equal to C<$b>)
1614 or -1 (when C<$a> is lesser than C<$b>).
1616 =item C<$result = $Collator-E<gt>eq($a, $b)>
1618 =item C<$result = $Collator-E<gt>ne($a, $b)>
1620 =item C<$result = $Collator-E<gt>lt($a, $b)>
1622 =item C<$result = $Collator-E<gt>le($a, $b)>
1624 =item C<$result = $Collator-E<gt>gt($a, $b)>
1626 =item C<$result = $Collator-E<gt>ge($a, $b)>
1628 They works like the same name operators as theirs.
1630 eq : whether $a is equal to $b.
1631 ne : whether $a is not equal to $b.
1632 lt : whether $a is lesser than $b.
1633 le : whether $a is lesser than $b or equal to $b.
1634 gt : whether $a is greater than $b.
1635 ge : whether $a is greater than $b or equal to $b.
1637 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1639 -- see 4.3 Form Sort Key, UTS #10.
1643 You compare the sort keys using a binary comparison
1644 and get the result of the comparison of the strings using UCA.
1646 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1650 $Collator->cmp($a, $b)
1652 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1654 Converts a sorting key into its representation form.
1655 If C<UCA_Version> is 8, the output is slightly different.
1657 use Unicode::Collate;
1658 my $c = Unicode::Collate->new();
1659 print $c->viewSortKey("Perl"),"\n";
1662 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1663 # Level 1 Level 2 Level 3 Level 4
1667 =head2 Methods for Searching
1669 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1670 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1671 but they are not aware of any pattern, but only a literal substring.
1673 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1674 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1675 C<subst>, C<gsubst>) is croaked, as the position and the length might
1676 differ from those on the specified string.
1678 C<rearrange> and C<hangul_terminator> parameters are neglected.
1679 C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1680 matching and searching, as it doesn't matter whether greater or lesser.
1684 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1686 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1688 If C<$substring> matches a part of C<$string>, returns
1689 the position of the first occurrence of the matching part in scalar context;
1690 in list context, returns a two-element list of
1691 the position and the length of the matching part.
1693 If C<$substring> does not match any part of C<$string>,
1694 returns C<-1> in scalar context and
1695 an empty list in list context.
1699 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1700 # (normalization => undef) is REQUIRED.
1701 my $str = "Ich muß studieren Perl.";
1704 if (my($pos,$len) = $Collator->index($str, $sub)) {
1705 $match = substr($str, $pos, $len);
1708 and get C<"muß"> in C<$match> since C<"muß">
1709 is primary equal to C<"MÜSS">.
1711 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1713 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1715 If C<$substring> matches a part of C<$string>, in scalar context, returns
1716 B<a reference to> the first occurrence of the matching part
1717 (C<$match_ref> is always true if matches,
1718 since every reference is B<true>);
1719 in list context, returns the first occurrence of the matching part.
1721 If C<$substring> does not match any part of C<$string>,
1722 returns C<undef> in scalar context and
1723 an empty list in list context.
1727 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1728 print "matches [$$match_ref].\n";
1730 print "doesn't match.\n";
1735 if (($match) = $Collator->match($str, $sub)) { # list context
1736 print "matches [$match].\n";
1738 print "doesn't match.\n";
1741 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1743 If C<$substring> matches a part of C<$string>, returns
1744 all the matching parts (or matching count in scalar context).
1746 If C<$substring> does not match any part of C<$string>,
1747 returns an empty list.
1749 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1751 If C<$substring> matches a part of C<$string>,
1752 the first occurrence of the matching part is replaced by C<$replacement>
1753 (C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1755 C<$replacement> can be a C<CODEREF>,
1756 taking the matching part as an argument,
1757 and returning a string to replace the matching part
1758 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1760 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1762 If C<$substring> matches a part of C<$string>,
1763 all the occurrences of the matching part are replaced by C<$replacement>
1764 (C<$string> is modified) and C<$count> is returned.
1766 C<$replacement> can be a C<CODEREF>,
1767 taking the matching part as an argument,
1768 and returning a string to replace the matching part
1769 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1773 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1774 # (normalization => undef) is REQUIRED.
1775 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1776 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1778 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1779 # i.e., all the camels are made bold-faced.
1781 Examples: levels and ignore_level2 - what does camel match?
1782 ---------------------------------------------------------------------------
1783 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1784 -----------------------|---------------------------------------------------
1785 1 false | yes yes yes yes yes
1786 2 false | yes yes no yes yes
1787 3 false | yes no no yes yes
1788 4 false | yes no no no yes
1789 -----------------------|---------------------------------------------------
1790 1 true | yes yes yes yes yes
1791 2 true | yes yes yes yes yes
1792 3 true | yes no yes yes yes
1793 4 true | yes no yes no yes
1794 ---------------------------------------------------------------------------
1795 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1800 =head2 Other Methods
1804 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1806 =item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1808 Changes the value of specified keys and returns the changed part.
1810 $Collator = Unicode::Collate->new(level => 4);
1812 $Collator->eq("perl", "PERL"); # false
1814 %old = $Collator->change(level => 2); # returns (level => 4).
1816 $Collator->eq("perl", "PERL"); # true
1818 $Collator->change(%old); # returns (level => 2).
1820 $Collator->eq("perl", "PERL"); # false
1822 Not all C<(key,value)>s are allowed to be changed.
1823 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1825 In the scalar context, returns the modified collator
1826 (but it is B<not> a clone from the original).
1828 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1830 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1832 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1834 =item C<$version = $Collator-E<gt>version()>
1836 Returns the version number (a string) of the Unicode Standard
1837 which the C<table> file used by the collator object is based on.
1838 If the table does not include a version line (starting with C<@version>),
1839 returns C<"unknown">.
1841 =item C<UCA_Version()>
1843 Returns the revision number of UTS #10 this module consults,
1844 that should correspond with the DUCET incorporated.
1846 =item C<Base_Unicode_Version()>
1848 Returns the version number of UTS #10 this module consults,
1849 that should correspond with the DUCET incorporated.
1855 No method will be exported.
1859 Though this module can be used without any C<table> file,
1860 to use this module easily, it is recommended to install a table file
1861 in the UCA format, by copying it under the directory
1862 <a place in @INC>/Unicode/Collate.
1864 The most preferable one is "The Default Unicode Collation Element Table"
1865 (aka DUCET), available from the Unicode Consortium's website:
1867 http://www.unicode.org/Public/UCA/
1869 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1871 If DUCET is not installed, it is recommended to copy the file
1872 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1873 to <a place in @INC>/Unicode/Collate/allkeys.txt
1882 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1883 module (see L<Unicode::Normalize>).
1885 If you need not it (say, in the case when you need not
1886 handle any combining characters),
1887 assign C<normalization =E<gt> undef> explicitly.
1889 -- see 6.5 Avoiding Normalization, UTS #10.
1891 =item Conformance Test
1893 The Conformance Test for the UCA is available
1894 under L<http://www.unicode.org/Public/UCA/>.
1896 For F<CollationTest_SHIFTED.txt>,
1897 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1898 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1899 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1901 If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
1902 C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
1903 C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
1904 C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
1906 B<Unicode::Normalize is required to try The Conformance Test.>
1910 =head1 AUTHOR, COPYRIGHT AND LICENSE
1912 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1913 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2012,
1914 SADAHIRO Tomoyuki. Japan. All rights reserved.
1916 This module is free software; you can redistribute it and/or
1917 modify it under the same terms as Perl itself.
1919 The file Unicode/Collate/allkeys.txt was copied verbatim
1920 from L<http://www.unicode.org/Public/UCA/6.1.0/allkeys.txt>.
1921 For this file, Copyright (c) 2001-2011 Unicode, Inc.
1922 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1928 =item Unicode Collation Algorithm - UTS #10
1930 L<http://www.unicode.org/reports/tr10/>
1932 =item The Default Unicode Collation Element Table (DUCET)
1934 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1936 =item The conformance test for the UCA
1938 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1940 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1942 =item Hangul Syllable Type
1944 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1946 =item Unicode Normalization Forms - UAX #15
1948 L<http://www.unicode.org/reports/tr15/>
1950 =item Unicode Locale Data Markup Language (LDML) - UTS #35
1952 L<http://www.unicode.org/reports/tr35/>