1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.89';
18 our $PACKAGE = __PACKAGE__;
22 our @ISA = qw(DynaLoader);
23 bootstrap Unicode::Collate $VERSION;
26 my @Path = qw(Unicode Collate);
27 my $KeyFile = "allkeys.txt";
30 use constant TRUE => 1;
31 use constant FALSE => "";
32 use constant NOMATCHPOS => -1;
34 # A coderef to get combining class imported from Unicode::Normalize
35 # (i.e. \&Unicode::Normalize::getCombinClass).
36 # This is also used as a HAS_UNICODE_NORMALIZE flag.
40 use constant MinLevel => 1;
41 use constant MaxLevel => 4;
43 # Minimum weights at level 2 and 3, respectively
44 use constant Min2Wt => 0x20;
45 use constant Min3Wt => 0x02;
47 # Shifted weight at 4th level
48 use constant Shift4Wt => 0xFFFF;
50 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
51 # PROBLEM: The Default Unicode Collation Element Table
52 # has weights over 0xFFFF at the 4th level.
53 # The tie-breaking in the variable weights
54 # other than "shift" (as well as "shift-trimmed") is unreliable.
55 use constant VCE_TEMPLATE => 'Cn4';
57 # A sort key: 16-bit weights
58 # See also the PROBLEM on VCE_TEMPLATE above.
59 use constant KEY_TEMPLATE => 'n*';
61 # Level separator in a sort key:
62 # i.e. pack(KEY_TEMPLATE, 0)
63 use constant LEVEL_SEP => "\0\0";
65 # As Unicode code point separator for hash keys.
66 # A joined code point string (denoted by JCPS below)
67 # like "65;768" is used for internal processing
68 # instead of Perl's Unicode string like "\x41\x{300}",
69 # as the native code point is different from the Unicode code point
71 # This character must not be included in any stringified
72 # representation of an integer.
73 use constant CODE_SEP => ';';
74 # NOTE: in regex /;/ is used for $jcps!
76 # boolean values of variable weights
77 use constant NON_VAR => 0; # Non-Variable character
78 use constant VAR => 1; # Variable character
80 # specific code points
81 use constant Hangul_SIni => 0xAC00;
82 use constant Hangul_SFin => 0xD7A3;
84 # Logical_Order_Exception in PropList.txt
85 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
87 sub UCA_Version { "24" }
89 sub Base_Unicode_Version { "6.1.0" }
94 return pack('U*', @_);
101 blanked non-ignorable shifted shift-trimmed
102 / } = (); # keys lowercased
105 alternate backwards level normalization rearrange
106 katakana_before_hiragana upper_before_lower ignore_level2
107 overrideHangul overrideCJK preprocess UCA_Version
108 hangul_terminator variable
112 entry mapping table maxlength contraction
113 ignoreChar ignoreName undefChar undefName rewrite
114 versionTable alternateTable backwardsTable forwardsTable
115 rearrangeTable variableTable
116 derivCode normCode rearrangeHash backwardsFlag
117 suppress suppressHash
118 __useXS /; ### XS only
119 # The hash key 'ignored' is deleted at v 0.21.
120 # The hash key 'isShift' is deleted at v 0.23.
121 # The hash key 'combining' is deleted at v 0.24.
122 # The hash key 'entries' is deleted at v 0.30.
123 # The hash key 'L3_ignorable' is deleted at v 0.40.
127 return $self->{versionTable} || 'unknown';
130 my (%ChangeOK, %ChangeNG);
131 @ChangeOK{ @ChangeOK } = ();
132 @ChangeNG{ @ChangeNG } = ();
138 if (exists $hash{variable} && exists $hash{alternate}) {
139 delete $hash{alternate};
141 elsif (!exists $hash{variable} && exists $hash{alternate}) {
142 $hash{variable} = $hash{alternate};
144 foreach my $k (keys %hash) {
145 if (exists $ChangeOK{$k}) {
146 $old{$k} = $self->{$k};
147 $self->{$k} = $hash{$k};
149 elsif (exists $ChangeNG{$k}) {
150 croak "change of $k via change() is not allowed!";
154 $self->checkCollator();
155 return wantarray ? %old : $self;
160 my $key = shift; # 'level' or 'backwards'
161 MinLevel <= $level or croak sprintf
162 "Illegal level %d (in value for key '%s') lower than %d.",
163 $level, $key, MinLevel;
164 $level <= MaxLevel or croak sprintf
165 "Unsupported level %d (in value for key '%s') higher than %d.",
166 $level, $key, MaxLevel;
172 11 => \&_derivCE_9, # 11 == 9
174 16 => \&_derivCE_14, # 16 == 14
183 _checkLevel($self->{level}, "level");
185 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
186 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
188 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
189 $self->{alternateTable} || 'shifted';
190 $self->{variable} = $self->{alternate} = lc($self->{variable});
191 exists $VariableOK{ $self->{variable} }
192 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
194 if (! defined $self->{backwards}) {
195 $self->{backwardsFlag} = 0;
197 elsif (! ref $self->{backwards}) {
198 _checkLevel($self->{backwards}, "backwards");
199 $self->{backwardsFlag} = 1 << $self->{backwards};
203 $self->{backwardsFlag} = 0;
204 for my $b (@{ $self->{backwards} }) {
205 _checkLevel($b, "backwards");
208 for my $v (sort keys %level) {
209 $self->{backwardsFlag} += 1 << $v;
213 defined $self->{rearrange} or $self->{rearrange} = [];
214 ref $self->{rearrange}
215 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
217 # keys of $self->{rearrangeHash} are $self->{rearrange}.
218 $self->{rearrangeHash} = undef;
220 if (@{ $self->{rearrange} }) {
221 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
224 $self->{normCode} = undef;
226 if (defined $self->{normalization}) {
227 eval { require Unicode::Normalize };
228 $@ and croak "Unicode::Normalize is required to normalize strings";
230 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
232 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
233 $self->{normCode} = \&Unicode::Normalize::NFD;
235 elsif ($self->{normalization} ne 'prenormalized') {
236 my $norm = $self->{normalization};
237 $self->{normCode} = sub {
238 Unicode::Normalize::normalize($norm, shift);
240 eval { $self->{normCode}->("") }; # try
241 $@ and croak "$PACKAGE unknown normalization form name: $norm";
250 my $self = bless { @_ }, $class;
252 ### begin XS only ###
253 if (! exists $self->{table} && !defined $self->{rewrite} &&
254 !defined $self->{undefName} && !defined $self->{ignoreName} &&
255 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
256 $self->{__useXS} = \&_fetch_simple;
258 $self->{__useXS} = undef;
262 # keys of $self->{suppressHash} are $self->{suppress}.
263 if ($self->{suppress} && @{ $self->{suppress} }) {
264 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
265 } # before read_table()
267 # If undef is passed explicitly, no file is read.
268 $self->{table} = $KeyFile if ! exists $self->{table};
269 $self->read_table() if defined $self->{table};
271 if ($self->{entry}) {
272 while ($self->{entry} =~ /([^\n]+)/g) {
273 $self->parseEntry($1, TRUE);
277 $self->{level} ||= MaxLevel;
278 $self->{UCA_Version} ||= UCA_Version();
280 $self->{overrideHangul} = FALSE
281 if ! exists $self->{overrideHangul};
282 $self->{overrideCJK} = FALSE
283 if ! exists $self->{overrideCJK};
284 $self->{normalization} = 'NFD'
285 if ! exists $self->{normalization};
286 $self->{rearrange} = $self->{rearrangeTable} ||
287 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
288 if ! exists $self->{rearrange};
289 $self->{backwards} = $self->{backwardsTable}
290 if ! exists $self->{backwards};
292 $self->checkCollator();
299 my $line = shift; # after s/^\s*\@//
301 if ($line =~ /^version\s*(\S*)/) {
302 $self->{versionTable} ||= $1;
304 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
305 $self->{variableTable} ||= $1;
307 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
308 $self->{alternateTable} ||= $1;
310 elsif ($line =~ /^backwards\s+(\S*)/) {
311 push @{ $self->{backwardsTable} }, $1;
313 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
314 push @{ $self->{forwardsTable} }, $1;
316 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
317 push @{ $self->{rearrangeTable} }, _getHexArray($1);
324 ### begin XS only ###
325 if ($self->{__useXS}) {
326 my @rest = _fetch_rest(); # complex matter need to parse
327 for my $line (@rest) {
328 next if $line =~ /^\s*#/;
330 if ($line =~ s/^\s*\@//) {
331 $self->parseAtmark($line);
333 $self->parseEntry($line);
341 foreach my $d (@INC) {
342 $f = File::Spec->catfile($d, @Path, $self->{table});
343 last if open($fh, $f);
347 $f = File::Spec->catfile(@Path, $self->{table});
348 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
351 while (my $line = <$fh>) {
352 next if $line =~ /^\s*#/;
354 if ($line =~ s/^\s*\@//) {
355 $self->parseAtmark($line);
357 $self->parseEntry($line);
365 ## get $line, parse it, and write an entry in $self
371 my $tailoring = shift;
372 my($name, $entry, @uv, @key);
374 if (defined $self->{rewrite}) {
375 $line = $self->{rewrite}->($line);
378 return if $line !~ /^\s*[0-9A-Fa-f]/;
380 # removes comment and gets name
382 if $line =~ s/[#%]\s*(.*)//;
383 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
386 my($e, $k) = split /;/, $line;
387 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
390 @uv = _getHexArray($e);
392 return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
393 exists $self->{suppressHash}{$uv[0]};
394 $entry = join(CODE_SEP, @uv); # in JCPS
396 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
397 my $ele = pack_U(@uv);
399 # regarded as if it were not entried in the table
401 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
403 # replaced as completely ignorable
404 $k = '[.0000.0000.0000.0000]'
405 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
408 # replaced as completely ignorable
409 $k = '[.0000.0000.0000.0000]'
410 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
412 my $is_L3_ignorable = TRUE;
414 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
415 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
416 my @wt = _getHexArray($arr);
417 push @key, pack(VCE_TEMPLATE, $var, @wt);
418 $is_L3_ignorable = FALSE
419 if $wt[0] || $wt[1] || $wt[2];
420 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
421 # is completely ignorable.
422 # For expansion, an entry $is_L3_ignorable
423 # if and only if "all" CEs are [.0000.0000.0000].
426 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
429 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
430 $self->{maxlength}{$uv[0]} = @uv;
436 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
437 $self->{contraction}{$fake_entry} = 1;
446 $self->visualizeSortKey($self->getSortKey(@_));
451 ## arrayref of JCPS = splitEnt(string to be collated)
452 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
459 my $code = $self->{preprocess};
460 my $norm = $self->{normCode};
461 my $map = $self->{mapping};
462 my $max = $self->{maxlength};
463 my $reH = $self->{rearrangeHash};
464 my $vers = $self->{UCA_Version};
465 my $ver9 = $vers >= 9 && $vers <= 11;
466 my $uXS = $self->{__useXS}; ### XS only
471 $code and croak "Preprocess breaks character positions. "
472 . "Don't use with index(), match(), etc.";
473 $norm and croak "Normalization breaks character positions. "
474 . "Don't use with index(), match(), etc.";
479 $str = &$code($str) if ref $code;
480 $str = &$norm($str) if ref $norm;
483 # get array of Unicode code point of string.
484 my @src = unpack_U($str);
487 # Character positions are not kept if rearranged,
488 # then neglected if $wLen is true.
489 if ($reH && ! $wLen) {
490 for (my $i = 0; $i < @src; $i++) {
491 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
492 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
498 # remove a code point marked as a completely ignorable.
499 for (my $i = 0; $i < @src; $i++) {
500 if (_isIllegal($src[$i]) || $vers <= 20 && _isNonchar($src[$i])) {
503 $src[$i] = undef if $map->{ $src[$i] } &&
504 @{ $map->{ $src[$i] } } == 0;
505 ### begin XS only ###
507 $src[$i] = undef if _ignorable_simple($src[$i]);
513 for (my $i = 0; $i < @src; $i++) {
516 # skip removed code point
517 if (! defined $jcps) {
519 $buf[-1][2] = $i + 1;
528 my $temp_jcps = $jcps;
530 my $maxLen = $max->{$jcps};
532 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
533 next if ! defined $src[$p];
534 $temp_jcps .= CODE_SEP . $src[$p];
536 if ($map->{$temp_jcps}) {
542 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
543 # This process requires Unicode::Normalize.
544 # If "normalization" is undef, here should be skipped *always*
545 # (in spite of bool value of $CVgetCombinClass),
546 # since canonical ordering cannot be expected.
547 # Blocked combining character should not be contracted.
549 # $self->{normCode} is false in the case of "prenormalized".
550 if ($self->{normalization}) {
551 my $cont = $self->{contraction};
557 for (my $p = $i + 1; $p < @src; $p++) {
558 next if ! defined $src[$p];
559 my $curCC = $CVgetCombinClass->($src[$p]);
561 my $tail = CODE_SEP . $src[$p];
563 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
564 $cont->{$jcps_uc.$tail})) {
571 if ($preCC != $curCC && $map->{$jcps.$tail}) {
579 if ($map->{$jcps_uc}) {
581 $src[$_] = undef for @out_uc;
583 $src[$_] = undef for @out;
588 # skip completely ignorable
589 if ($uXS && $jcps !~ /;/ && _ignorable_simple($jcps) || ### XS only
590 $map->{$jcps} && @{ $map->{$jcps} } == 0) {
592 $buf[-1][2] = $i + 1;
597 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
603 ## VCE = _pack_override(input, codepoint, derivCode)
605 sub _pack_override ($$$) {
611 return pack(VCE_TEMPLATE, NON_VAR, @$r);
612 } elsif (defined $r) {
613 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
620 ## list of VCE = getWt(JCPS)
626 my $map = $self->{mapping};
627 my $der = $self->{derivCode};
628 my $uXS = $self->{__useXS}; ### XS only
630 return if !defined $u;
631 return map($self->varCE($_), @{ $map->{$u} })
633 ### begin XS only ###
634 return map($self->varCE($_), _fetch_simple($u))
635 if $uXS && _exists_simple($u);
638 # JCPS must not be a contraction, then it's a code point.
639 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
640 my $hang = $self->{overrideHangul};
643 @hangulCE = map _pack_override($_, $u, $der), $hang->($u);
644 } elsif (!defined $hang) {
645 @hangulCE = $der->($u);
647 my $max = $self->{maxlength};
648 my @decH = _decompHangul($u);
651 my $contract = join(CODE_SEP, @decH);
652 @decH = ($contract) if $map->{$contract};
653 } else { # must be <@decH == 3>
654 if ($max->{$decH[0]}) {
655 my $contract = join(CODE_SEP, @decH);
656 if ($map->{$contract}) {
659 $contract = join(CODE_SEP, @decH[0,1]);
660 $map->{$contract} and @decH = ($contract, $decH[2]);
662 # even if V's ignorable, LT contraction is not supported.
663 # If such a situation were required, NFD should be used.
665 if (@decH == 3 && $max->{$decH[1]}) {
666 my $contract = join(CODE_SEP, @decH[1,2]);
667 $map->{$contract} and @decH = ($decH[0], $contract);
672 $map->{$_} ? @{ $map->{$_} } :
673 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
677 return map $self->varCE($_), @hangulCE;
679 my $cjk = $self->{overrideCJK};
680 my $vers = $self->{UCA_Version};
681 if ($cjk && _isUIdeo($u, $vers)) {
682 my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
683 return map $self->varCE($_), @cjkCE;
685 if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
686 return map $self->varCE($_), _uideoCE_8($u);
688 return map $self->varCE($_), $der->($u);
694 ## string sortkey = getSortKey(string arg)
699 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
700 my $vers = $self->{UCA_Version};
701 my $term = $self->{hangul_terminator};
703 my @buf; # weight arrays
706 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
707 foreach my $jcps (@$rEnt) {
708 # weird things like VL, TL-contraction are not considered!
709 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
710 if ($preHST && !$curHST || # hangul before non-hangul
711 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
712 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
713 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
717 push @buf, $self->getWt($jcps);
719 push @buf, $termCE if $preHST; # end at hangul
721 foreach my $jcps (@$rEnt) {
722 push @buf, $self->getWt($jcps);
726 return $self->mk_SortKey(\@buf); ### XS only
731 ## int compare = cmp(string a, string b)
733 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
734 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
735 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
736 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
737 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
738 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
739 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
742 ## list[strings] sorted = sort(list[strings] arg)
748 sort{ $a->[0] cmp $b->[0] }
749 map [ $obj->getSortKey($_), $_ ], @_;
754 ## bool _nonIgnorAtLevel(arrayref weights, int level)
756 sub _nonIgnorAtLevel($$)
759 return if ! defined $wt;
761 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
766 ## arrayref of arrayref[weights] source,
767 ## arrayref of arrayref[weights] substr,
769 ## * comparison of graphemes vs graphemes.
770 ## @$source >= @$substr must be true (check it before call this);
778 for my $g (0..@$substr-1){
779 # Do the $g'th graphemes have the same number of AV weights?
780 return if @{ $source->[$g] } != @{ $substr->[$g] };
782 for my $w (0..@{ $substr->[$g] }-1) {
783 for my $v (0..$lev-1) {
784 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
792 ## (int position, int length)
793 ## int position = index(string, substring, position, [undoc'ed global])
795 ## With "global" (only for the list context),
796 ## returns list of arrayref[position, length].
802 my $len = length($str);
803 my $subE = $self->splitEnt(shift);
804 my $pos = @_ ? shift : 0;
805 $pos = 0 if $pos < 0;
808 my $lev = $self->{level};
809 my $v2i = $self->{UCA_Version} >= 9 &&
810 $self->{variable} ne 'non-ignorable';
813 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
815 ? map([$_, 0], $temp..$len)
816 : wantarray ? ($temp,0) : $temp;
819 and return wantarray ? () : NOMATCHPOS;
820 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
822 or return wantarray ? () : NOMATCHPOS;
824 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
826 my $last_is_variable;
827 for my $vwt (map $self->getWt($_), @$subE) {
828 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
829 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
831 # "Ignorable (L1, L2) after Variable" since track. v. 9
834 $last_is_variable = TRUE;
836 elsif (!$wt[0]) { # ignorable
837 $to_be_pushed = FALSE if $last_is_variable;
840 $last_is_variable = FALSE;
844 if (@subWt && !$var && !$wt[0]) {
845 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
846 } elsif ($to_be_pushed) {
847 push @subWt, [ \@wt ];
853 my $end = @$strE - 1;
855 $last_is_variable = FALSE; # reuse
856 for (my $i = 0; $i <= $end; ) { # no $i++
860 while ($i <= $end && $found_base == 0) {
861 for my $vwt ($self->getWt($strE->[$i][0])) {
862 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
863 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
865 # "Ignorable (L1, L2) after Variable" since track. v. 9
868 $last_is_variable = TRUE;
870 elsif (!$wt[0]) { # ignorable
871 $to_be_pushed = FALSE if $last_is_variable;
874 $last_is_variable = FALSE;
878 if (@strWt && !$var && !$wt[0]) {
879 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
880 $finPos[-1] = $strE->[$i][2];
881 } elsif ($to_be_pushed) {
882 push @strWt, [ \@wt ];
883 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
884 $finPos[-1] = NOMATCHPOS if $found_base;
885 push @finPos, $strE->[$i][2];
894 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
895 if ($iniPos[0] != NOMATCHPOS &&
896 $finPos[$#subWt] != NOMATCHPOS &&
897 _eqArray(\@strWt, \@subWt, $lev)) {
898 my $temp = $iniPos[0] + $pos;
901 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
902 splice @strWt, 0, $#subWt;
903 splice @iniPos, 0, $#subWt;
904 splice @finPos, 0, $#subWt;
908 ? ($temp, $finPos[$#subWt] - $iniPos[0])
920 : wantarray ? () : NOMATCHPOS;
924 ## scalarref to matching part = match(string, substring)
929 if (my($pos,$len) = $self->index($_[0], $_[1])) {
930 my $temp = substr($_[0], $pos, $len);
931 return wantarray ? $temp : \$temp;
932 # An lvalue ref \substr should be avoided,
933 # since its value is affected by modification of its referent.
941 ## arrayref matching parts = gmatch(string, substring)
948 return map substr($str, $_->[0], $_->[1]),
949 $self->index($str, $sub, 0, 'g');
953 ## bool subst'ed = subst(string, substring, replace)
958 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
960 if (my($pos,$len) = $self->index($_[0], $_[1])) {
962 my $mat = substr($_[0], $pos, $len);
963 substr($_[0], $pos, $len, $code->($mat));
965 substr($_[0], $pos, $len, $_[2]);
975 ## int count = gsubst(string, substring, replace)
980 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
983 # Replacement is carried out from the end, then use reverse.
984 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
986 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
987 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
989 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1001 Unicode::Collate - Unicode Collation Algorithm
1005 use Unicode::Collate;
1008 $Collator = Unicode::Collate->new(%tailoring);
1011 @sorted = $Collator->sort(@not_sorted);
1014 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1016 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1017 according to Perl's Unicode support. See L<perlunicode>,
1018 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1019 Otherwise you can use C<preprocess> or should decode them before.
1023 This module is an implementation of Unicode Technical Standard #10
1024 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1026 =head2 Constructor and Tailoring
1028 The C<new> method returns a collator object. If new() is called
1029 with no parameters, the collator should do the default collation.
1031 $Collator = Unicode::Collate->new(
1032 UCA_Version => $UCA_Version,
1033 alternate => $alternate, # alias for 'variable'
1034 backwards => $levelNumber, # or \@levelNumbers
1036 hangul_terminator => $term_primary_weight,
1037 ignoreName => qr/$ignoreName/,
1038 ignoreChar => qr/$ignoreChar/,
1039 ignore_level2 => $bool,
1040 katakana_before_hiragana => $bool,
1041 level => $collationLevel,
1042 normalization => $normalization_form,
1043 overrideCJK => \&overrideCJK,
1044 overrideHangul => \&overrideHangul,
1045 preprocess => \&preprocess,
1046 rearrange => \@charList,
1047 rewrite => \&rewrite,
1048 suppress => \@charList,
1050 undefName => qr/$undefName/,
1051 undefChar => qr/$undefChar/,
1052 upper_before_lower => $bool,
1053 variable => $variable,
1060 If the revision (previously "tracking version") number of UCA is given,
1061 behavior of that revision is emulated on collating.
1062 If omitted, the return value of C<UCA_Version()> is used.
1064 The following revisions are supported. The default is 24.
1066 UCA Unicode Standard DUCET (@version)
1067 -------------------------------------------------------
1068 8 3.1 3.0.1 (3.0.1d9)
1069 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1070 11 4.0 4.0.0 (4.0.0)
1071 14 4.1.0 4.1.0 (4.1.0)
1072 16 5.0 5.0.0 (5.0.0)
1073 18 5.1.0 5.1.0 (5.1.0)
1074 20 5.2.0 5.2.0 (5.2.0)
1075 22 6.0.0 6.0.0 (6.0.0)
1076 24 6.1.0 6.1.0 (6.1.0)
1078 * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1079 since C<UCA_Version> 22.
1081 * Fully ignorable characters were ignored, and would not interrupt
1082 contractions with C<UCA_Version> 9 and 11.
1084 * Treatment of ignorables after variables and some behaviors
1085 were changed at C<UCA_Version> 9.
1087 * Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1088 depend on C<UCA_Version>.
1090 * Many hangul jamo are assigned at C<UCA_Version> 20, that will affect
1091 C<hangul_terminator>.
1095 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1097 For backward compatibility, C<alternate> (old name) can be used
1098 as an alias for C<variable>.
1102 -- see 3.1.2 French Accents, UTS #10.
1104 backwards => $levelNumber or \@levelNumbers
1106 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1107 If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1108 forwards at all the levels.
1112 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1114 If the same character (or a sequence of characters) exists
1115 in the collation element table through C<table>,
1116 mapping to collation elements is overridden.
1117 If it does not exist, the mapping is defined additionally.
1119 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1120 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1121 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1122 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1123 006C 006C ; [.0F4C.0020.0002.006C] # ll
1124 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1125 004C 004C ; [.0F4C.0020.0008.004C] # LL
1126 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1127 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1128 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1129 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1132 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1133 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1134 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1137 B<NOTE:> The code point in the UCA file format (before C<';'>)
1138 B<must> be a Unicode code point (defined as hexadecimal),
1139 but not a native code point.
1140 So C<0063> must always denote C<U+0063>,
1141 but not a character of C<"\x63">.
1143 Weighting may vary depending on collation element table.
1144 So ensure the weights defined in C<entry> will be consistent with
1145 those in the collation element table loaded via C<table>.
1147 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1148 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1149 (as a value between C<0E60> and C<0E6D>)
1150 makes ordering as C<C E<lt> CH E<lt> D>.
1151 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1152 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1153 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1154 and C<c-curl> (C<U+0255>) with C<0E69>.
1155 Then primary weight C<0E6A> for C<CH> makes C<CH>
1156 ordered between C<c-curl> and C<D>.
1158 =item hangul_terminator
1160 -- see 7.1.4 Trailing Weights, UTS #10.
1162 If a true value is given (non-zero but should be positive),
1163 it will be added as a terminator primary weight to the end of
1164 every standard Hangul syllable. Secondary and any higher weights
1165 for terminator are set to zero.
1166 If the value is false or C<hangul_terminator> key does not exist,
1167 insertion of terminator weights will not be performed.
1169 Boundaries of Hangul syllables are determined
1170 according to conjoining Jamo behavior in F<the Unicode Standard>
1171 and F<HangulSyllableType.txt>.
1173 B<Implementation Note:>
1174 (1) For expansion mapping (Unicode character mapped
1175 to a sequence of collation elements), a terminator will not be added
1176 between collation elements, even if Hangul syllable boundary exists there.
1177 Addition of terminator is restricted to the next position
1178 to the last collation element.
1180 (2) Non-conjoining Hangul letters
1181 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1182 automatically terminated with a terminator primary weight.
1183 These characters may need terminator included in a collation element
1190 -- see 3.2.2 Variable Weighting, UTS #10.
1192 Makes the entry in the table completely ignorable;
1193 i.e. as if the weights were zero at all level.
1195 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1196 will be ignored. Through C<ignoreName>, any character whose name
1197 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1200 E.g. when 'a' and 'e' are ignorable,
1201 'element' is equal to 'lament' (or 'lmnt').
1205 -- see 5.1 Parametric Tailoring, UTS #10.
1207 By default, case-sensitive comparison (that is level 3 difference)
1208 won't ignore accents (that is level 2 difference).
1210 If the parameter is made true, accents (and other primary ignorable
1211 characters) are ignored, even though cases are taken into account.
1213 B<NOTE>: C<level> should be 3 or greater.
1215 =item katakana_before_hiragana
1217 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1219 By default, hiragana is before katakana.
1220 If the parameter is made true, this is reversed.
1222 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1223 distinctions must occur in level 3, and their weights at level 3 must be
1224 same as those mentioned in 7.3.1, UTS #10.
1225 If you define your collation elements which violate this requirement,
1226 this parameter does not work validly.
1230 -- see 4.3 Form Sort Key, UTS #10.
1232 Set the maximum level.
1233 Any higher levels than the specified one are ignored.
1235 Level 1: alphabetic ordering
1236 Level 2: diacritic ordering
1237 Level 3: case ordering
1238 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1242 If omitted, the maximum is the 4th.
1246 -- see 4.1 Normalize, UTS #10.
1248 If specified, strings are normalized before preparation of sort keys
1249 (the normalization is executed after preprocess).
1251 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1252 as C<$normalization_form>.
1253 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1254 See C<Unicode::Normalize::normalize()> for detail.
1255 If omitted, C<'NFD'> is used.
1257 C<normalization> is performed after C<preprocess> (if defined).
1259 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1260 though they are not concerned with C<Unicode::Normalize::normalize()>.
1262 If C<undef> (not a string C<"undef">) is passed explicitly
1263 as the value for this key,
1264 any normalization is not carried out (this may make tailoring easier
1265 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1266 only contiguous contractions are resolved;
1267 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1268 C<A-cedilla-ring> would be primary equal to C<A>.
1270 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1271 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1273 In the case of C<(normalization =E<gt> "prenormalized")>,
1274 any normalization is not performed, but
1275 discontiguous contractions with combining characters are performed.
1277 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1278 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1279 If source strings are finely prenormalized,
1280 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1282 Except C<(normalization =E<gt> undef)>,
1283 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1287 -- see 7.1 Derived Collation Elements, UTS #10.
1289 By default, CJK unified ideographs are ordered in Unicode codepoint
1290 order, but those in the CJK Unified Ideographs block are lesser than
1291 those in the CJK Unified Ideographs Extension A etc.
1293 In the CJK Unified Ideographs block:
1294 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1295 U+4E00..U+9FBB if UCA_Version is 14 or 16.
1296 U+4E00..U+9FC3 if UCA_Version is 18.
1297 U+4E00..U+9FCB if UCA_Version is 20 or 22.
1298 U+4E00..U+9FCC if UCA_Version is 24.
1300 In the CJK Unified Ideographs Extension blocks:
1301 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1302 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or greater.
1303 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
1305 Through C<overrideCJK>, ordering of CJK unified ideographs (including
1306 extensions) can be overridden.
1308 ex. CJK unified ideographs in the JIS code point order.
1310 overrideCJK => sub {
1311 my $u = shift; # get a Unicode codepoint
1312 my $b = pack('n', $u); # to UTF-16BE
1313 my $s = your_unicode_to_sjis_converter($b); # convert
1314 my $n = unpack('n', $s); # convert sjis to short
1315 [ $n, 0x20, 0x2, $u ]; # return the collation element
1318 The return value may be an arrayref of 1st to 4th weights as shown
1319 above. The return value may be an integer as the primary weight
1320 as shown below. If C<undef> is returned, the default derived
1321 collation element will be used.
1323 overrideCJK => sub {
1324 my $u = shift; # get a Unicode codepoint
1325 my $b = pack('n', $u); # to UTF-16BE
1326 my $s = your_unicode_to_sjis_converter($b); # convert
1327 my $n = unpack('n', $s); # convert sjis to short
1328 return $n; # return the primary weight
1331 The return value may be a list containing zero or more of
1332 an arrayref, an integer, or C<undef>.
1334 ex. ignores all CJK unified ideographs.
1336 overrideCJK => sub {()}, # CODEREF returning empty list
1338 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1339 # as U+4E00 is a CJK unified ideograph and to be ignorable.
1341 If C<undef> is passed explicitly as the value for this key,
1342 weights for CJK unified ideographs are treated as undefined.
1343 But assignment of weight for CJK unified ideographs
1344 in C<table> or C<entry> is still valid.
1346 B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1347 C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1348 C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1349 ideographs. But they can't be overridden via C<overrideCJK> when you use
1350 DUCET, as the table includes weights for them. C<table> or C<entry> has
1351 priority over C<overrideCJK>.
1353 =item overrideHangul
1355 -- see 7.1 Derived Collation Elements, UTS #10.
1357 By default, Hangul syllables are decomposed into Hangul Jamo,
1358 even if C<(normalization =E<gt> undef)>.
1359 But the mapping of Hangul syllables may be overridden.
1361 This parameter works like C<overrideCJK>, so see there for examples.
1363 If you want to override the mapping of Hangul syllables,
1364 NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1365 Hangul syllables before overriding. FCD may decompose Hangul syllables
1368 If C<undef> is passed explicitly as the value for this key,
1369 weight for Hangul syllables is treated as undefined
1370 without decomposition into Hangul Jamo.
1371 But definition of weight for Hangul syllables
1372 in C<table> or C<entry> is still valid.
1376 -- see 5.1 Preprocessing, UTS #10.
1378 If specified, the coderef is used to preprocess each string
1379 before the formation of sort keys.
1381 ex. dropping English articles, such as "a" or "the".
1382 Then, "the pen" is before "a pencil".
1386 $str =~ s/\b(?:an?|the)\s+//gi;
1390 C<preprocess> is performed before C<normalization> (if defined).
1392 ex. decoding strings in a legacy encoding such as shift-jis:
1394 $sjis_collator = Unicode::Collate->new(
1395 preprocess => \&your_shiftjis_to_unicode_decoder,
1397 @result = $sjis_collator->sort(@shiftjis_strings);
1399 B<Note:> Strings returned from the coderef will be interpreted
1400 according to Perl's Unicode support. See L<perlunicode>,
1401 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1405 -- see 3.1.3 Rearrangement, UTS #10.
1407 Characters that are not coded in logical order and to be rearranged.
1408 If C<UCA_Version> is equal to or lesser than 11, default is:
1410 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1412 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1413 (a reference to empty list) as the value for this key.
1415 If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1416 (i.e. no rearrangement).
1418 B<According to the version 9 of UCA, this parameter shall not be used;
1419 but it is not warned at present.>
1423 If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1424 The coderef will get each line, and then should return a rewritten line
1425 according to the UCA file format.
1426 If the coderef returns an empty line, the line will be skipped.
1428 e.g. any primary ignorable characters into tertiary ignorable:
1432 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1436 This example shows rewriting weights. C<rewrite> is allowed to
1437 affect code points, weights, and the name.
1439 B<NOTE>: C<table> is available to use another table file;
1440 preparing a modified table once would be more efficient than
1441 rewriting lines on reading an unmodified table every time.
1445 -- see suppress contractions in 5.14.11 Special-Purpose Commands,
1448 Contractions beginning with the specified characters are suppressed,
1449 even if those contractions are defined in C<table>.
1451 An example for Russian and some languages using the Cyrillic script:
1453 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1455 where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1457 B<NOTE>: Contractions via C<entry> are not be suppressed.
1461 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1463 You can use another collation element table if desired.
1465 The table file should locate in the F<Unicode/Collate> directory
1466 on C<@INC>. Say, if the filename is F<Foo.txt>,
1467 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1469 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1470 If you will prepare your own table file, any name other than F<allkeys.txt>
1471 may be better to avoid namespace conflict.
1473 B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1474 module, and it may save time at the run time.
1475 Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1476 or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1477 C<rewrite> will prevent this module from using the compiled DUCET.
1479 If C<undef> is passed explicitly as the value for this key,
1480 no file is read (but you can define collation elements via C<entry>).
1482 A typical way to define a collation element table
1483 without any file of table:
1485 $onlyABC = Unicode::Collate->new(
1487 entry => << 'ENTRIES',
1488 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1489 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1490 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1491 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1492 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1493 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1497 If C<ignoreName> or C<undefName> is used, character names should be
1498 specified as a comment (following C<#>) on each line.
1504 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1506 Undefines the collation element as if it were unassigned in the C<table>.
1507 This reduces the size of the table.
1508 If an unassigned character appears in the string to be collated,
1509 the sort key is made from its codepoint
1510 as a single-character collation element,
1511 as it is greater than any other assigned collation elements
1512 (in the codepoint order among the unassigned characters).
1513 But, it'd be better to ignore characters
1514 unfamiliar to you and maybe never used.
1516 Through C<undefChar>, any character matching C<qr/$undefChar/>
1517 will be undefined. Through C<undefName>, any character whose name
1518 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1521 ex. Collation weights for beyond-BMP characters are not stored in object:
1523 undefChar => qr/[^\0-\x{fffd}]/,
1525 =item upper_before_lower
1527 -- see 6.6 Case Comparisons, UTS #10.
1529 By default, lowercase is before uppercase.
1530 If the parameter is made true, this is reversed.
1532 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1533 distinctions must occur in level 3, and their weights at level 3 must be
1534 same as those mentioned in 7.3.1, UTS #10.
1535 If you define your collation elements which differs from this requirement,
1536 this parameter doesn't work validly.
1540 -- see 3.2.2 Variable Weighting, UTS #10.
1542 This key allows for variable weighting of variable collation elements,
1543 which are marked with an ASTERISK in the table
1544 (NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1546 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1548 These names are case-insensitive.
1549 By default (if specification is omitted), 'shifted' is adopted.
1551 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1552 considered at the 4th level.
1554 'Non-Ignorable' Variable elements are not reset to ignorable.
1556 'Shifted' Variable elements are made ignorable at levels 1 through 3
1557 their level 4 weight is replaced by the old level 1 weight.
1558 Level 4 weight for Non-Variable elements is 0xFFFF.
1560 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1565 =head2 Methods for Collation
1569 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1571 Sorts a list of strings.
1573 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1575 Returns 1 (when C<$a> is greater than C<$b>)
1576 or 0 (when C<$a> is equal to C<$b>)
1577 or -1 (when C<$a> is lesser than C<$b>).
1579 =item C<$result = $Collator-E<gt>eq($a, $b)>
1581 =item C<$result = $Collator-E<gt>ne($a, $b)>
1583 =item C<$result = $Collator-E<gt>lt($a, $b)>
1585 =item C<$result = $Collator-E<gt>le($a, $b)>
1587 =item C<$result = $Collator-E<gt>gt($a, $b)>
1589 =item C<$result = $Collator-E<gt>ge($a, $b)>
1591 They works like the same name operators as theirs.
1593 eq : whether $a is equal to $b.
1594 ne : whether $a is not equal to $b.
1595 lt : whether $a is lesser than $b.
1596 le : whether $a is lesser than $b or equal to $b.
1597 gt : whether $a is greater than $b.
1598 ge : whether $a is greater than $b or equal to $b.
1600 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1602 -- see 4.3 Form Sort Key, UTS #10.
1606 You compare the sort keys using a binary comparison
1607 and get the result of the comparison of the strings using UCA.
1609 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1613 $Collator->cmp($a, $b)
1615 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1617 Converts a sorting key into its representation form.
1618 If C<UCA_Version> is 8, the output is slightly different.
1620 use Unicode::Collate;
1621 my $c = Unicode::Collate->new();
1622 print $c->viewSortKey("Perl"),"\n";
1625 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1626 # Level 1 Level 2 Level 3 Level 4
1630 =head2 Methods for Searching
1632 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1633 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1634 but they are not aware of any pattern, but only a literal substring.
1636 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1637 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1638 C<subst>, C<gsubst>) is croaked, as the position and the length might
1639 differ from those on the specified string.
1641 C<rearrange> and C<hangul_terminator> parameters are neglected.
1642 C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1643 matching and searching, as it doesn't matter whether greater or lesser.
1647 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1649 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1651 If C<$substring> matches a part of C<$string>, returns
1652 the position of the first occurrence of the matching part in scalar context;
1653 in list context, returns a two-element list of
1654 the position and the length of the matching part.
1656 If C<$substring> does not match any part of C<$string>,
1657 returns C<-1> in scalar context and
1658 an empty list in list context.
1662 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1663 # (normalization => undef) is REQUIRED.
1664 my $str = "Ich muß studieren Perl.";
1667 if (my($pos,$len) = $Collator->index($str, $sub)) {
1668 $match = substr($str, $pos, $len);
1671 and get C<"muß"> in C<$match> since C<"muß">
1672 is primary equal to C<"MÜSS">.
1674 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1676 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1678 If C<$substring> matches a part of C<$string>, in scalar context, returns
1679 B<a reference to> the first occurrence of the matching part
1680 (C<$match_ref> is always true if matches,
1681 since every reference is B<true>);
1682 in list context, returns the first occurrence of the matching part.
1684 If C<$substring> does not match any part of C<$string>,
1685 returns C<undef> in scalar context and
1686 an empty list in list context.
1690 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1691 print "matches [$$match_ref].\n";
1693 print "doesn't match.\n";
1698 if (($match) = $Collator->match($str, $sub)) { # list context
1699 print "matches [$match].\n";
1701 print "doesn't match.\n";
1704 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1706 If C<$substring> matches a part of C<$string>, returns
1707 all the matching parts (or matching count in scalar context).
1709 If C<$substring> does not match any part of C<$string>,
1710 returns an empty list.
1712 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1714 If C<$substring> matches a part of C<$string>,
1715 the first occurrence of the matching part is replaced by C<$replacement>
1716 (C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
1718 C<$replacement> can be a C<CODEREF>,
1719 taking the matching part as an argument,
1720 and returning a string to replace the matching part
1721 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1723 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1725 If C<$substring> matches a part of C<$string>,
1726 all the occurrences of the matching part are replaced by C<$replacement>
1727 (C<$string> is modified) and C<$count> is returned.
1729 C<$replacement> can be a C<CODEREF>,
1730 taking the matching part as an argument,
1731 and returning a string to replace the matching part
1732 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1736 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1737 # (normalization => undef) is REQUIRED.
1738 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1739 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1741 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1742 # i.e., all the camels are made bold-faced.
1744 Examples: levels and ignore_level2 - what does camel match?
1745 ---------------------------------------------------------------------------
1746 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1747 -----------------------|---------------------------------------------------
1748 1 false | yes yes yes yes yes
1749 2 false | yes yes no yes yes
1750 3 false | yes no no yes yes
1751 4 false | yes no no no yes
1752 -----------------------|---------------------------------------------------
1753 1 true | yes yes yes yes yes
1754 2 true | yes yes yes yes yes
1755 3 true | yes no yes yes yes
1756 4 true | yes no yes no yes
1757 ---------------------------------------------------------------------------
1758 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1763 =head2 Other Methods
1767 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1769 =item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1771 Changes the value of specified keys and returns the changed part.
1773 $Collator = Unicode::Collate->new(level => 4);
1775 $Collator->eq("perl", "PERL"); # false
1777 %old = $Collator->change(level => 2); # returns (level => 4).
1779 $Collator->eq("perl", "PERL"); # true
1781 $Collator->change(%old); # returns (level => 2).
1783 $Collator->eq("perl", "PERL"); # false
1785 Not all C<(key,value)>s are allowed to be changed.
1786 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1788 In the scalar context, returns the modified collator
1789 (but it is B<not> a clone from the original).
1791 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1793 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1795 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1797 =item C<$version = $Collator-E<gt>version()>
1799 Returns the version number (a string) of the Unicode Standard
1800 which the C<table> file used by the collator object is based on.
1801 If the table does not include a version line (starting with C<@version>),
1802 returns C<"unknown">.
1804 =item C<UCA_Version()>
1806 Returns the revision number of UTS #10 this module consults,
1807 that should correspond with the DUCET incorporated.
1809 =item C<Base_Unicode_Version()>
1811 Returns the version number of UTS #10 this module consults,
1812 that should correspond with the DUCET incorporated.
1818 No method will be exported.
1822 Though this module can be used without any C<table> file,
1823 to use this module easily, it is recommended to install a table file
1824 in the UCA format, by copying it under the directory
1825 <a place in @INC>/Unicode/Collate.
1827 The most preferable one is "The Default Unicode Collation Element Table"
1828 (aka DUCET), available from the Unicode Consortium's website:
1830 http://www.unicode.org/Public/UCA/
1832 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1834 If DUCET is not installed, it is recommended to copy the file
1835 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1836 to <a place in @INC>/Unicode/Collate/allkeys.txt
1845 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1846 module (see L<Unicode::Normalize>).
1848 If you need not it (say, in the case when you need not
1849 handle any combining characters),
1850 assign C<normalization =E<gt> undef> explicitly.
1852 -- see 6.5 Avoiding Normalization, UTS #10.
1854 =item Conformance Test
1856 The Conformance Test for the UCA is available
1857 under L<http://www.unicode.org/Public/UCA/>.
1859 For F<CollationTest_SHIFTED.txt>,
1860 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1861 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1862 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1864 B<Unicode::Normalize is required to try The Conformance Test.>
1868 =head1 AUTHOR, COPYRIGHT AND LICENSE
1870 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1871 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2012,
1872 SADAHIRO Tomoyuki. Japan. All rights reserved.
1874 This module is free software; you can redistribute it and/or
1875 modify it under the same terms as Perl itself.
1877 The file Unicode/Collate/allkeys.txt was copied verbatim
1878 from L<http://www.unicode.org/Public/UCA/6.1.0/allkeys.txt>.
1879 For this file, Copyright (c) 2001-2011 Unicode, Inc.
1880 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1886 =item Unicode Collation Algorithm - UTS #10
1888 L<http://www.unicode.org/reports/tr10/>
1890 =item The Default Unicode Collation Element Table (DUCET)
1892 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1894 =item The conformance test for the UCA
1896 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1898 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1900 =item Hangul Syllable Type
1902 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1904 =item Unicode Normalization Forms - UAX #15
1906 L<http://www.unicode.org/reports/tr15/>
1908 =item Unicode Locale Data Markup Language (LDML) - UTS #35
1910 L<http://www.unicode.org/reports/tr35/>