1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
7 unless (0x41 == unpack('U', 'A')) {
8 die "Unicode::Collate cannot get a Unicode code point\n";
20 our $VERSION = '1.07';
21 our $PACKAGE = __PACKAGE__;
25 our @ISA = qw(DynaLoader);
26 bootstrap Unicode::Collate $VERSION;
29 my @Path = qw(Unicode Collate);
30 my $KeyFile = "allkeys.txt";
33 use constant TRUE => 1;
34 use constant FALSE => "";
35 use constant NOMATCHPOS => -1;
37 # A coderef to get combining class imported from Unicode::Normalize
38 # (i.e. \&Unicode::Normalize::getCombinClass).
39 # This is also used as a HAS_UNICODE_NORMALIZE flag.
43 use constant MinLevel => 1;
44 use constant MaxLevel => 4;
46 # Minimum weights at level 2 and 3, respectively
47 use constant Min2Wt => 0x20;
48 use constant Min3Wt => 0x02;
50 # Shifted weight at 4th level
51 use constant Shift4Wt => 0xFFFF;
53 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
54 use constant VCE_TEMPLATE => 'Cn4';
56 # A sort key: 16-bit weights
57 use constant KEY_TEMPLATE => 'n*';
59 # The tie-breaking: 32-bit weights
60 use constant TIE_TEMPLATE => 'N*';
62 # Level separator in a sort key:
63 # i.e. pack(KEY_TEMPLATE, 0)
64 use constant LEVEL_SEP => "\0\0";
66 # As Unicode code point separator for hash keys.
67 # A joined code point string (denoted by JCPS below)
68 # like "65;768" is used for internal processing
69 # instead of Perl's Unicode string like "\x41\x{300}",
70 # as the native code point is different from the Unicode code point
72 # This character must not be included in any stringified
73 # representation of an integer.
74 use constant CODE_SEP => ';';
75 # NOTE: in regex /;/ is used for $jcps!
77 # boolean values of variable weights
78 use constant NON_VAR => 0; # Non-Variable character
79 use constant VAR => 1; # Variable character
81 # specific code points
82 use constant Hangul_SIni => 0xAC00;
83 use constant Hangul_SFin => 0xD7A3;
85 # Logical_Order_Exception in PropList.txt
86 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
88 # for highestFFFF and minimalFFFE
89 my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
90 my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE);
92 sub UCA_Version { "28" }
94 sub Base_Unicode_Version { "6.3.0" }
99 return pack('U*', @_);
102 ### begin XS only ###
103 *unpack_U = exists &Unicode::Collate::bootstrap &&
104 $] < 5.008 && \&unpackUfor56 && 0x41 == unpackUfor56('A')
105 ? \&unpackUfor56 : sub { return unpack('U*', shift(@_).pack('U*')) };
112 blanked non-ignorable shifted shift-trimmed
113 / } = (); # keys lowercased
116 alternate backwards level normalization rearrange
117 katakana_before_hiragana upper_before_lower ignore_level2
118 overrideCJK overrideHangul overrideOut preprocess UCA_Version
119 hangul_terminator variable identical highestFFFF minimalFFFE
124 entry mapping table maxlength contraction
125 ignoreChar ignoreName undefChar undefName rewrite
126 versionTable alternateTable backwardsTable forwardsTable
127 rearrangeTable variableTable
128 derivCode normCode rearrangeHash backwardsFlag
129 suppress suppressHash
130 __useXS /; ### XS only
131 # The hash key 'ignored' was deleted at v 0.21.
132 # The hash key 'isShift' was deleted at v 0.23.
133 # The hash key 'combining' was deleted at v 0.24.
134 # The hash key 'entries' was deleted at v 0.30.
135 # The hash key 'L3_ignorable' was deleted at v 0.40.
139 return $self->{versionTable} || 'unknown';
142 my (%ChangeOK, %ChangeNG);
143 @ChangeOK{ @ChangeOK } = ();
144 @ChangeNG{ @ChangeNG } = ();
150 if (exists $hash{alternate}) {
151 if (exists $hash{variable}) {
152 delete $hash{alternate};
154 $hash{variable} = $hash{alternate};
157 foreach my $k (keys %hash) {
158 if (exists $ChangeOK{$k}) {
159 $old{$k} = $self->{$k};
160 $self->{$k} = $hash{$k};
161 } elsif (exists $ChangeNG{$k}) {
162 croak "change of $k via change() is not allowed!";
166 $self->checkCollator();
167 return wantarray ? %old : $self;
172 my $key = shift; # 'level' or 'backwards'
173 MinLevel <= $level or croak sprintf
174 "Illegal level %d (in value for key '%s') lower than %d.",
175 $level, $key, MinLevel;
176 $level <= MaxLevel or croak sprintf
177 "Unsupported level %d (in value for key '%s') higher than %d.",
178 $level, $key, MaxLevel;
184 11 => \&_derivCE_9, # 11 == 9
186 16 => \&_derivCE_14, # 16 == 14
191 26 => \&_derivCE_24, # 26 == 24
192 28 => \&_derivCE_24, # 28 == 24
197 _checkLevel($self->{level}, "level");
199 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
200 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
202 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
203 $self->{alternateTable} || 'shifted';
204 $self->{variable} = $self->{alternate} = lc($self->{variable});
205 exists $VariableOK{ $self->{variable} }
206 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
208 if (! defined $self->{backwards}) {
209 $self->{backwardsFlag} = 0;
210 } elsif (! ref $self->{backwards}) {
211 _checkLevel($self->{backwards}, "backwards");
212 $self->{backwardsFlag} = 1 << $self->{backwards};
215 $self->{backwardsFlag} = 0;
216 for my $b (@{ $self->{backwards} }) {
217 _checkLevel($b, "backwards");
220 for my $v (sort keys %level) {
221 $self->{backwardsFlag} += 1 << $v;
225 defined $self->{rearrange} or $self->{rearrange} = [];
226 ref $self->{rearrange}
227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
229 # keys of $self->{rearrangeHash} are $self->{rearrange}.
230 $self->{rearrangeHash} = undef;
232 if (@{ $self->{rearrange} }) {
233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
236 $self->{normCode} = undef;
238 if (defined $self->{normalization}) {
239 eval { require Unicode::Normalize };
240 $@ and croak "Unicode::Normalize is required to normalize strings";
242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
245 $self->{normCode} = \&Unicode::Normalize::NFD;
247 elsif ($self->{normalization} ne 'prenormalized') {
248 my $norm = $self->{normalization};
249 $self->{normCode} = sub {
250 Unicode::Normalize::normalize($norm, shift);
252 eval { $self->{normCode}->("") }; # try
253 $@ and croak "$PACKAGE unknown normalization form name: $norm";
262 my $self = bless { @_ }, $class;
264 ### begin XS only ###
265 if (! exists $self->{table} && !defined $self->{rewrite} &&
266 !defined $self->{undefName} && !defined $self->{ignoreName} &&
267 !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
268 $self->{__useXS} = \&_fetch_simple;
270 $self->{__useXS} = undef;
274 # keys of $self->{suppressHash} are $self->{suppress}.
275 if ($self->{suppress} && @{ $self->{suppress} }) {
276 @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
277 } # before read_table()
279 # If undef is passed explicitly, no file is read.
280 $self->{table} = $KeyFile if ! exists $self->{table};
281 $self->read_table() if defined $self->{table};
283 if ($self->{entry}) {
284 while ($self->{entry} =~ /([^\n]+)/g) {
285 $self->parseEntry($1, TRUE);
289 # only in new(), not in change()
290 $self->{level} ||= MaxLevel;
291 $self->{UCA_Version} ||= UCA_Version();
293 $self->{overrideHangul} = FALSE
294 if ! exists $self->{overrideHangul};
295 $self->{overrideCJK} = FALSE
296 if ! exists $self->{overrideCJK};
297 $self->{normalization} = 'NFD'
298 if ! exists $self->{normalization};
299 $self->{rearrange} = $self->{rearrangeTable} ||
300 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
301 if ! exists $self->{rearrange};
302 $self->{backwards} = $self->{backwardsTable}
303 if ! exists $self->{backwards};
304 exists $self->{long_contraction} or $self->{long_contraction}
305 = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;
307 # checkCollator() will be called in change()
308 $self->checkCollator();
315 my $line = shift; # after s/^\s*\@//
317 if ($line =~ /^version\s*(\S*)/) {
318 $self->{versionTable} ||= $1;
320 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
321 $self->{variableTable} ||= $1;
323 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
324 $self->{alternateTable} ||= $1;
326 elsif ($line =~ /^backwards\s+(\S*)/) {
327 push @{ $self->{backwardsTable} }, $1;
329 elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
330 push @{ $self->{forwardsTable} }, $1;
332 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
333 push @{ $self->{rearrangeTable} }, _getHexArray($1);
340 ### begin XS only ###
341 if ($self->{__useXS}) {
342 my @rest = _fetch_rest(); # complex matter need to parse
343 for my $line (@rest) {
344 next if $line =~ /^\s*#/;
346 if ($line =~ s/^\s*\@//) {
347 $self->parseAtmark($line);
349 $self->parseEntry($line);
357 foreach my $d (@INC) {
358 $f = File::Spec->catfile($d, @Path, $self->{table});
359 last if open($fh, $f);
363 $f = File::Spec->catfile(@Path, $self->{table});
364 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
367 while (my $line = <$fh>) {
368 next if $line =~ /^\s*#/;
370 if ($line =~ s/^\s*\@//) {
371 $self->parseAtmark($line);
373 $self->parseEntry($line);
381 ## get $line, parse it, and write an entry in $self
387 my $tailoring = shift;
388 my($name, $entry, @uv, @key);
390 if (defined $self->{rewrite}) {
391 $line = $self->{rewrite}->($line);
394 return if $line !~ /^\s*[0-9A-Fa-f]/;
396 # removes comment and gets name
398 if $line =~ s/[#%]\s*(.*)//;
399 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
402 my($e, $k) = split /;/, $line;
403 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
406 @uv = _getHexArray($e);
408 return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
409 exists $self->{suppressHash}{$uv[0]};
410 $entry = join(CODE_SEP, @uv); # in JCPS
412 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
413 my $ele = pack_U(@uv);
415 # regarded as if it were not stored in the table
417 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
419 # replaced as completely ignorable
420 $k = '[.0000.0000.0000.0000]'
421 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
424 # replaced as completely ignorable
425 $k = '[.0000.0000.0000.0000]'
426 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
428 my $is_L3_ignorable = TRUE;
430 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
431 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
432 my @wt = _getHexArray($arr);
433 push @key, pack(VCE_TEMPLATE, $var, @wt);
434 $is_L3_ignorable = FALSE
435 if $wt[0] || $wt[1] || $wt[2];
436 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
437 # is completely ignorable.
438 # For expansion, an entry $is_L3_ignorable
439 # if and only if "all" CEs are [.0000.0000.0000].
442 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
445 if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
446 $self->{maxlength}{$uv[0]} = @uv;
451 my $fake_entry = join(CODE_SEP, @uv); # in JCPS
452 $self->{contraction}{$fake_entry} = 1;
461 $self->visualizeSortKey($self->getSortKey($str));
469 my $prep = $self->{preprocess};
470 my $norm = $self->{normCode};
472 $str = &$prep($str) if ref $prep;
473 $str = &$norm($str) if ref $norm;
478 ## arrayref of JCPS = splitEnt(string to be collated)
479 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
485 my $wLen = shift; # with Length
487 my $map = $self->{mapping};
488 my $max = $self->{maxlength};
489 my $reH = $self->{rearrangeHash};
490 my $vers = $self->{UCA_Version};
491 my $ver9 = $vers >= 9 && $vers <= 11;
492 my $long = $self->{long_contraction};
493 my $uXS = $self->{__useXS}; ### XS only
497 # get array of Unicode code point of string.
498 my @src = unpack_U($str);
501 # Character positions are not kept if rearranged,
502 # then neglected if $wLen is true.
503 if ($reH && ! $wLen) {
504 for (my $i = 0; $i < @src; $i++) {
505 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
506 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
512 # remove a code point marked as a completely ignorable.
513 for (my $i = 0; $i < @src; $i++) {
514 if ($vers <= 20 && _isIllegal($src[$i])) {
517 $src[$i] = undef if $map->{ $src[$i] }
518 ? @{ $map->{ $src[$i] } } == 0
519 : $uXS && _ignorable_simple($src[$i]); ### XS only
523 for (my $i = 0; $i < @src; $i++) {
526 # skip removed code point
527 if (! defined $jcps) {
529 $buf[-1][2] = $i + 1;
538 my $temp_jcps = $jcps;
540 my $maxLen = $max->{$jcps};
542 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
543 next if ! defined $src[$p];
544 $temp_jcps .= CODE_SEP . $src[$p];
546 if ($map->{$temp_jcps}) {
552 # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
553 # This process requires Unicode::Normalize.
554 # If "normalization" is undef, here should be skipped *always*
555 # (in spite of bool value of $CVgetCombinClass),
556 # since canonical ordering cannot be expected.
557 # Blocked combining character should not be contracted.
559 # $self->{normCode} is false in the case of "prenormalized".
560 if ($self->{normalization}) {
561 my $cont = $self->{contraction};
567 for (my $p = $i + 1; $p < @src; $p++) {
568 next if ! defined $src[$p];
569 my $curCC = $CVgetCombinClass->($src[$p]);
571 my $tail = CODE_SEP . $src[$p];
573 if ($preCC != $curCC && $map->{$jcps.$tail}) {
582 if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
583 $cont->{$jcps_uc.$tail})) {
591 if (@out_uc && $map->{$jcps_uc}) {
593 $src[$_] = undef for @out_uc;
595 $src[$_] = undef for @out;
600 # skip completely ignorable
601 if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
602 $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
604 $buf[-1][2] = $i + 1;
609 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
615 ## VCE = _pack_override(input, codepoint, derivCode)
617 sub _pack_override ($$$) {
623 return pack(VCE_TEMPLATE, NON_VAR, @$r);
624 } elsif (defined $r) {
625 return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
627 $u = 0xFFFD if 0x10FFFF < $u;
633 ## list of VCE = getWt(JCPS)
639 my $map = $self->{mapping};
640 my $der = $self->{derivCode};
641 my $out = $self->{overrideOut};
642 my $uXS = $self->{__useXS}; ### XS only
644 return if !defined $u;
645 return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
646 return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
647 $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;
651 @ce = @{ $map->{$u} }; # $u may be a contraction
652 ### begin XS only ###
653 } elsif ($uXS && _exists_simple($u)) {
654 @ce = _fetch_simple($u);
656 } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
657 my $hang = $self->{overrideHangul};
659 @ce = map _pack_override($_, $u, $der), $hang->($u);
660 } elsif (!defined $hang) {
663 my $max = $self->{maxlength};
664 my @decH = _decompHangul($u);
667 my $contract = join(CODE_SEP, @decH);
668 @decH = ($contract) if $map->{$contract};
669 } else { # must be <@decH == 3>
670 if ($max->{$decH[0]}) {
671 my $contract = join(CODE_SEP, @decH);
672 if ($map->{$contract}) {
675 $contract = join(CODE_SEP, @decH[0,1]);
676 $map->{$contract} and @decH = ($contract, $decH[2]);
678 # even if V's ignorable, LT contraction is not supported.
679 # If such a situation were required, NFD should be used.
681 if (@decH == 3 && $max->{$decH[1]}) {
682 my $contract = join(CODE_SEP, @decH[1,2]);
683 $map->{$contract} and @decH = ($decH[0], $contract);
688 $map->{$_} ? @{ $map->{$_} } :
689 $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
693 } elsif ($out && 0x10FFFF < $u) {
694 @ce = map _pack_override($_, $u, $der), $out->($u);
696 my $cjk = $self->{overrideCJK};
697 my $vers = $self->{UCA_Version};
698 if ($cjk && _isUIdeo($u, $vers)) {
699 @ce = map _pack_override($_, $u, $der), $cjk->($u);
700 } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
701 @ce = _uideoCE_8($u);
706 return map $self->varCE($_), @ce;
711 ## string sortkey = getSortKey(string arg)
717 my $str = $self->process($orig);
718 my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
719 my $vers = $self->{UCA_Version};
720 my $term = $self->{hangul_terminator};
721 my $lev = $self->{level};
722 my $iden = $self->{identical};
724 my @buf; # weight arrays
727 my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
728 foreach my $jcps (@$rEnt) {
729 # weird things like VL, TL-contraction are not considered!
730 my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
731 if ($preHST && !$curHST || # hangul before non-hangul
732 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
733 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
734 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
738 push @buf, $self->getWt($jcps);
740 push @buf, $termCE if $preHST; # end at hangul
742 foreach my $jcps (@$rEnt) {
743 push @buf, $self->getWt($jcps);
747 my $rkey = $self->mk_SortKey(\@buf); ### XS only
749 if ($iden || $vers >= 26 && $lev == MaxLevel) {
751 $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
758 ## int compare = cmp(string a, string b)
## The seven comparison methods below share one shape:
##   $_[0] is the collator object, $_[1] and $_[2] are the two strings.
## Each one builds the sort key of both strings with getSortKey() and then
## applies the Perl string operator of the same name to the two keys, so
## cmp yields -1, 0 or 1 and the other six yield a Perl boolean.
## Both sort keys are rebuilt on every call; nothing is cached here.
760 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
761 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
762 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
763 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
764 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
765 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
766 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
769 ## list[strings] sorted = sort(list[strings] arg)
775 sort{ $a->[0] cmp $b->[0] }
776 map [ $obj->getSortKey($_), $_ ], @_;
781 ## bool _nonIgnorAtLevel(arrayref weights, int level)
783 sub _nonIgnorAtLevel($$)
786 return if ! defined $wt;
788 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
793 ## arrayref of arrayref[weights] source,
794 ## arrayref of arrayref[weights] substr,
796 ## * comparison of graphemes vs graphemes.
797 ## @$source >= @$substr must be true (check it before call this);
805 for my $g (0..@$substr-1){
806 # Do the $g'th graphemes have the same number of AV weights?
807 return if @{ $source->[$g] } != @{ $substr->[$g] };
809 for my $w (0..@{ $substr->[$g] }-1) {
810 for my $v (0..$lev-1) {
811 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
819 ## (int position, int length)
820 ## int position = index(string, substring, position, [undoc'ed global])
822 ## With "global" (only for the list context),
823 ## returns list of arrayref[position, length].
828 $self->{preprocess} and
829 croak "Don't use Preprocess with index(), match(), etc.";
830 $self->{normCode} and
831 croak "Don't use Normalization with index(), match(), etc.";
834 my $len = length($str);
836 my $subE = $self->splitEnt($sub);
837 my $pos = @_ ? shift : 0;
838 $pos = 0 if $pos < 0;
841 my $lev = $self->{level};
842 my $v2i = $self->{UCA_Version} >= 9 &&
843 $self->{variable} ne 'non-ignorable';
846 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
848 ? map([$_, 0], $temp..$len)
849 : wantarray ? ($temp,0) : $temp;
852 and return wantarray ? () : NOMATCHPOS;
853 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
855 or return wantarray ? () : NOMATCHPOS;
857 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
859 my $last_is_variable;
860 for my $vwt (map $self->getWt($_), @$subE) {
861 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
862 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
864 # "Ignorable (L1, L2) after Variable" since track. v. 9
867 $last_is_variable = TRUE;
869 elsif (!$wt[0]) { # ignorable
870 $to_be_pushed = FALSE if $last_is_variable;
873 $last_is_variable = FALSE;
877 if (@subWt && !$var && !$wt[0]) {
878 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
879 } elsif ($to_be_pushed) {
880 push @subWt, [ \@wt ];
886 my $end = @$strE - 1;
888 $last_is_variable = FALSE; # reuse
889 for (my $i = 0; $i <= $end; ) { # no $i++
893 while ($i <= $end && $found_base == 0) {
894 for my $vwt ($self->getWt($strE->[$i][0])) {
895 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
896 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
898 # "Ignorable (L1, L2) after Variable" since track. v. 9
901 $last_is_variable = TRUE;
903 elsif (!$wt[0]) { # ignorable
904 $to_be_pushed = FALSE if $last_is_variable;
907 $last_is_variable = FALSE;
911 if (@strWt && !$var && !$wt[0]) {
912 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
913 $finPos[-1] = $strE->[$i][2];
914 } elsif ($to_be_pushed) {
915 push @strWt, [ \@wt ];
916 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
917 $finPos[-1] = NOMATCHPOS if $found_base;
918 push @finPos, $strE->[$i][2];
927 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
928 if ($iniPos[0] != NOMATCHPOS &&
929 $finPos[$#subWt] != NOMATCHPOS &&
930 _eqArray(\@strWt, \@subWt, $lev)) {
931 my $temp = $iniPos[0] + $pos;
934 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
935 splice @strWt, 0, $#subWt;
936 splice @iniPos, 0, $#subWt;
937 splice @finPos, 0, $#subWt;
941 ? ($temp, $finPos[$#subWt] - $iniPos[0])
953 : wantarray ? () : NOMATCHPOS;
957 ## scalarref to matching part = match(string, substring)
962 if (my($pos,$len) = $self->index($_[0], $_[1])) {
963 my $temp = substr($_[0], $pos, $len);
964 return wantarray ? $temp : \$temp;
965 # An lvalue ref \substr should be avoided,
966 # since its value is affected by modification of its referent.
974 ## arrayref matching parts = gmatch(string, substring)
981 return map substr($str, $_->[0], $_->[1]),
982 $self->index($str, $sub, 0, 'g');
986 ## bool subst'ed = subst(string, substring, replace)
991 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
993 if (my($pos,$len) = $self->index($_[0], $_[1])) {
995 my $mat = substr($_[0], $pos, $len);
996 substr($_[0], $pos, $len, $code->($mat));
998 substr($_[0], $pos, $len, $_[2]);
1008 ## int count = gsubst(string, substring, replace)
1013 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1016 # Replacement is carried out from the end, then use reverse.
1017 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1019 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1020 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1022 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1034 Unicode::Collate - Unicode Collation Algorithm
1038 use Unicode::Collate;
1041 $Collator = Unicode::Collate->new(%tailoring);
1044 @sorted = $Collator->sort(@not_sorted);
1047 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1049 B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
1050 according to Perl's Unicode support. See L<perlunicode>,
1051 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1052 Otherwise, you can use C<preprocess>, or you should decode them beforehand.
1056 This module is an implementation of Unicode Technical Standard #10
1057 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1059 =head2 Constructor and Tailoring
1061 The C<new> method returns a collator object. If new() is called
1062 with no parameters, the collator should do the default collation.
1064 $Collator = Unicode::Collate->new(
1065 UCA_Version => $UCA_Version,
1066 alternate => $alternate, # alias for 'variable'
1067 backwards => $levelNumber, # or \@levelNumbers
1069 hangul_terminator => $term_primary_weight,
1070 highestFFFF => $bool,
1072 ignoreName => qr/$ignoreName/,
1073 ignoreChar => qr/$ignoreChar/,
1074 ignore_level2 => $bool,
1075 katakana_before_hiragana => $bool,
1076 level => $collationLevel,
1077 long_contraction => $bool,
1078 minimalFFFE => $bool,
1079 normalization => $normalization_form,
1080 overrideCJK => \&overrideCJK,
1081 overrideHangul => \&overrideHangul,
1082 preprocess => \&preprocess,
1083 rearrange => \@charList,
1084 rewrite => \&rewrite,
1085 suppress => \@charList,
1087 undefName => qr/$undefName/,
1088 undefChar => qr/$undefChar/,
1089 upper_before_lower => $bool,
1090 variable => $variable,
1097 If the revision (previously "tracking version") number of UCA is given,
1098 behavior of that revision is emulated on collating.
1099 If omitted, the return value of C<UCA_Version()> is used.
1101 The following revisions are supported. The default is 28.
1103 UCA Unicode Standard DUCET (@version)
1104 -------------------------------------------------------
1105 8 3.1 3.0.1 (3.0.1d9)
1106 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1107 11 4.0 4.0.0 (4.0.0)
1108 14 4.1.0 4.1.0 (4.1.0)
1109 16 5.0 5.0.0 (5.0.0)
1110 18 5.1.0 5.1.0 (5.1.0)
1111 20 5.2.0 5.2.0 (5.2.0)
1112 22 6.0.0 6.0.0 (6.0.0)
1113 24 6.1.0 6.1.0 (6.1.0)
1114 26 6.2.0 6.2.0 (6.2.0)
1115 28 6.3.0 6.3.0 (6.3.0)
1117 * See below C<long_contraction> with C<UCA_Version> 22 and 24.
1119 * Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
1120 since C<UCA_Version> 22.
1122 * Out-of-range codepoints (greater than U+10FFFF) are not ignored,
1123 and can be overridden since C<UCA_Version> 22.
1125 * Fully ignorable characters were ignored, and would not interrupt
1126 contractions with C<UCA_Version> 9 and 11.
1128 * Treatment of ignorables after variables and some behaviors
1129 were changed at C<UCA_Version> 9.
1131 * Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
1132 depend on C<UCA_Version>.
1134 * Many hangul jamo are assigned at C<UCA_Version> 20, which will affect
1135 C<hangul_terminator>.
1139 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1141 For backward compatibility, C<alternate> (old name) can be used
1142 as an alias for C<variable>.
1146 -- see 3.4 Backward Accents, UTS #10.
1148 backwards => $levelNumber or \@levelNumbers
1150 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1151 If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
1152 forwards at all the levels.
1156 -- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
1158 If the same character (or a sequence of characters) exists
1159 in the collation element table through C<table>,
1160 mapping to collation elements is overridden.
1161 If it does not exist, the mapping is defined additionally.
1163 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1164 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1165 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1166 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1167 006C 006C ; [.0F4C.0020.0002.006C] # ll
1168 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1169 004C 004C ; [.0F4C.0020.0008.004C] # LL
1170 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1171 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1172 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1173 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1176 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1177 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1178 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1181 B<NOTE:> The code point in the UCA file format (before C<';'>)
1182 B<must> be a Unicode code point (defined as hexadecimal),
1183 but not a native code point.
1184 So C<0063> must always denote C<U+0063>,
1185 but not a character of C<"\x63">.
1187 Weighting may vary depending on collation element table.
1188 So ensure the weights defined in C<entry> will be consistent with
1189 those in the collation element table loaded via C<table>.
1191 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1192 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1193 (as a value between C<0E60> and C<0E6D>)
1194 makes ordering as C<C E<lt> CH E<lt> D>.
1195 Strictly speaking, DUCET already has some characters between C<C> and C<D>:
1196 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1197 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1198 and C<c-curl> (C<U+0255>) with C<0E69>.
1199 Then primary weight C<0E6A> for C<CH> makes C<CH>
1200 ordered between C<c-curl> and C<D>.
1202 =item hangul_terminator
1204 -- see 7.1.4 Trailing Weights, UTS #10.
1206 If a true value is given (non-zero but should be positive),
1207 it will be added as a terminator primary weight to the end of
1208 every standard Hangul syllable. Secondary and any higher weights
1209 for terminator are set to zero.
1210 If the value is false or C<hangul_terminator> key does not exist,
1211 insertion of terminator weights will not be performed.
1213 Boundaries of Hangul syllables are determined
1214 according to conjoining Jamo behavior in F<the Unicode Standard>
1215 and F<HangulSyllableType.txt>.
1217 B<Implementation Note:>
1218 (1) For expansion mapping (Unicode character mapped
1219 to a sequence of collation elements), a terminator will not be added
1220 between collation elements, even if Hangul syllable boundary exists there.
1221 Addition of terminator is restricted to the next position
1222 to the last collation element.
1224 (2) Non-conjoining Hangul letters
1225 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1226 automatically terminated with a terminator primary weight.
1227 These characters may need terminator included in a collation element
1232 -- see 5.14 Collation Elements, UTS #35.
1234 If the parameter is made true, C<U+FFFF> has a highest primary weight.
1235 When a boolean of C<$coll-E<gt>ge($str, "abc")> and
1236 C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
1237 begins with C<"abc">, or another primary equivalent.
1238 C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
1239 such as C<"abc\x{FFFF}xyz">.
1241 C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
1242 almost, but the latter has a problem that you should know which letter is
1243 next to C<c>. For a certain language where C<ch> as the next letter,
1244 C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
1247 This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
1248 Any other character than C<U+FFFF> can be tailored by C<entry>.
1252 -- see A.3 Deterministic Comparison, UTS #10.
1254 By default, strings whose weights are equal should be equal,
1255 even though their code points are not equal.
1256 Completely ignorable characters are ignored.
1258 If the parameter is made true, a final, tie-breaking level is used.
1259 If no difference of weights is found after the comparison through
1260 all the levels specified by C<level>, the comparison with code points
1262 For the tie-breaking comparison, the sort key has code points
1263 of the original string appended.
1264 Completely ignorable characters are not ignored.
1266 If C<preprocess> and/or C<normalization> is applied, the code points
1267 of the string after them (in NFD by default) are used.
1273 -- see 3.6 Variable Weighting, UTS #10.
1275 Makes the entry in the table completely ignorable;
1276 i.e. as if the weights were zero at all levels.
1278 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1279 will be ignored. Through C<ignoreName>, any character whose name
1280 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1283 E.g. when 'a' and 'e' are ignorable,
1284 'element' is equal to 'lament' (or 'lmnt').
1288 -- see 5.1 Parametric Tailoring, UTS #10.
1290 By default, case-sensitive comparison (that is level 3 difference)
1291 won't ignore accents (that is level 2 difference).
1293 If the parameter is made true, accents (and other primary ignorable
1294 characters) are ignored, even though cases are taken into account.
1296 B<NOTE>: C<level> should be 3 or greater.
1298 =item katakana_before_hiragana
1300 -- see 7.2 Tertiary Weight Table, UTS #10.
1302 By default, hiragana is before katakana.
1303 If the parameter is made true, this is reversed.
1305 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1306 distinctions must occur in level 3, and their weights at level 3 must be
1307 same as those mentioned in 7.3.1, UTS #10.
1308 If you define your collation elements which violate this requirement,
1309 this parameter does not work validly.
1313 -- see 4.3 Form Sort Key, UTS #10.
1315 Set the maximum level.
1316 Any higher levels than the specified one are ignored.
1318 Level 1: alphabetic ordering
1319 Level 2: diacritic ordering
1320 Level 3: case ordering
1321 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1325 If omitted, the maximum is the 4th.
1327 B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
1328 But this module only uses weights within 0xFFFF.
1329 When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
1330 and 'shift-trimmed'), the level 4 may be unreliable.
1332 See also C<identical>.
1334 =item long_contraction
1336 -- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
1338 If the parameter is made true, for a contraction with three or more
1339 characters (here nicknamed "long contraction"), initial substrings
1341 For example, a contraction ABC, where A is a starter, and B and C
1342 are non-starters (characters with non-zero combining character class),
1343 will be detected even if there is not AB as a contraction.
1345 B<Default:> Usually false.
1346 If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
1347 is not specified in C<new()>, a true value is set implicitly.
1348 This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
1350 C<change()> handles C<long_contraction> explicitly only.
1351 If C<long_contraction> is not specified in C<change()>, even though
1352 C<UCA_Version> is changed, C<long_contraction> will not be changed.
1354 B<Limitation:> Scanning non-starters is one-way (no back tracking).
1355 If AB is found but ABC is not found, other long contractions where
1356 the first character is A and the second is not B may not be found.
1358 Under C<(normalization =E<gt> undef)>, the detection step for discontiguous
1359 contractions is skipped.
1361 B<Note:> The following contractions in DUCET are not considered
1362 in steps S2.1.1 to S2.1.3, where they are discontiguous.
1364 0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
1365 0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
1367 For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
1368 (C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
1369 In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
1370 instead of C<0FB2 0F71 0F80>.
1371 Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
1372 contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
1376 -- see 5.14 Collation Elements, UTS #35.
1378 If the parameter is made true, C<U+FFFE> has a minimal primary weight.
1379 The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
1380 first compares C<$a1> and C<$b1> at level 1, and
1381 then C<$a2> and C<$b2> at level 1, as follows.
1397 This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
1398 Any other character than C<U+FFFE> can be tailored by C<entry>.
1402 -- see 4.1 Normalize, UTS #10.
1404 If specified, strings are normalized before preparation of sort keys
1405 (the normalization is executed after preprocess).
1407 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1408 as C<$normalization_form>.
1409 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1410 See C<Unicode::Normalize::normalize()> for detail.
1411 If omitted, C<'NFD'> is used.
1413 C<normalization> is performed after C<preprocess> (if defined).
1415 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1416 though they are not concerned with C<Unicode::Normalize::normalize()>.
1418 If C<undef> (not a string C<"undef">) is passed explicitly
1419 as the value for this key,
1420 no normalization is carried out (this may make tailoring easier
1421 if normalization is not desired). Under C<(normalization =E<gt> undef)>,
1422 only contiguous contractions are resolved;
1423 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1424 C<A-cedilla-ring> would be primary equal to C<A>.
1426 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1427 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1429 In the case of C<(normalization =E<gt> "prenormalized")>,
1430 no normalization is performed, but
1431 discontiguous contractions with combining characters are still resolved.
1433 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1434 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1435 If source strings are finely prenormalized,
1436 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1438 Except C<(normalization =E<gt> undef)>,
1439 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1443 -- see 7.1 Derived Collation Elements, UTS #10.
1445 By default, CJK unified ideographs are ordered in Unicode codepoint
1446 order, but those in the CJK Unified Ideographs block are less than
1447 those in the CJK Unified Ideographs Extension A etc.
1449 In the CJK Unified Ideographs block:
1450 U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
1451 U+4E00..U+9FBB if UCA_Version is 14 or 16.
1452 U+4E00..U+9FC3 if UCA_Version is 18.
1453 U+4E00..U+9FCB if UCA_Version is 20 or 22.
1454 U+4E00..U+9FCC if UCA_Version is 24 or later.
1456 In the CJK Unified Ideographs Extension blocks:
1457 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
1458 Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
1459 Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
1461 Through C<overrideCJK>, ordering of CJK unified ideographs (including
1462 extensions) can be overridden.
1464 ex. CJK unified ideographs in the JIS code point order.
1466 overrideCJK => sub {
1467 my $u = shift; # get a Unicode codepoint
1468 my $b = pack('n', $u); # to UTF-16BE
1469 my $s = your_unicode_to_sjis_converter($b); # convert
1470 my $n = unpack('n', $s); # convert sjis to short
1471 [ $n, 0x20, 0x2, $u ]; # return the collation element
1474 The return value may be an arrayref of 1st to 4th weights as shown
1475 above. The return value may be an integer as the primary weight
1476 as shown below. If C<undef> is returned, the default derived
1477 collation element will be used.
1479 overrideCJK => sub {
1480 my $u = shift; # get a Unicode codepoint
1481 my $b = pack('n', $u); # to UTF-16BE
1482 my $s = your_unicode_to_sjis_converter($b); # convert
1483 my $n = unpack('n', $s); # convert sjis to short
1484 return $n; # return the primary weight
1487 The return value may be a list containing zero or more of
1488 an arrayref, an integer, or C<undef>.
1490 ex. ignores all CJK unified ideographs.
1492 overrideCJK => sub {()}, # CODEREF returning empty list
1494 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1495 # as U+4E00 is a CJK unified ideograph and to be ignorable.
1497 If a false value (including C<undef>) is passed, C<overrideCJK>
1499 C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
1501 But assignment of weight for CJK unified ideographs
1502 in C<table> or C<entry> is still valid.
1503 If C<undef> is passed explicitly as the value for this key,
1504 weights for CJK unified ideographs are treated as undefined.
1505 However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
1506 has no special meaning.
1508 B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
1509 C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
1510 C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
1511 ideographs. But they can't be overridden via C<overrideCJK> when you use
1512 DUCET, as the table includes weights for them. C<table> or C<entry> has
1513 priority over C<overrideCJK>.
1515 =item overrideHangul
1517 -- see 7.1 Derived Collation Elements, UTS #10.
1519 By default, Hangul syllables are decomposed into Hangul Jamo,
1520 even if C<(normalization =E<gt> undef)>.
1521 But the mapping of Hangul syllables may be overridden.
1523 This parameter works like C<overrideCJK>, so see there for examples.
1525 If you want to override the mapping of Hangul syllables,
1526 NFD and NFKD are not appropriate, since NFD and NFKD will decompose
1527 Hangul syllables before overriding. FCD may decompose Hangul syllables
1530 If a false value (but not C<undef>) is passed, C<overrideHangul>
1532 C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
1534 If C<undef> is passed explicitly as the value for this key,
1535 weight for Hangul syllables is treated as undefined
1536 without decomposition into Hangul Jamo.
1537 But definition of weight for Hangul syllables
1538 in C<table> or C<entry> is still valid.
1542 -- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
1544 Perl seems to allow out-of-range values (greater than 0x10FFFF).
1545 By default, out-of-range values are replaced with C<U+FFFD>
1546 (REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
1547 or ignored when C<UCA_Version> E<lt>= 20.
1549 When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
1550 can be overridden. Though C<table> or C<entry> are available for them,
1551 there are too many out-of-range values to list them individually.
1553 C<overrideOut> can perform it algorithmically.
1554 This parameter works like C<overrideCJK>, so see there for examples.
1556 ex. ignores all out-of-range values.
1558 overrideOut => sub {()}, # CODEREF returning empty list
1560 If a false value (including C<undef>) is passed, C<overrideOut>
1562 C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
1564 B<NOTE ABOUT U+FFFD:>
1566 UCA recommends that out-of-range values should not be ignored for security
1567 reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
1568 However, C<U+FFFD> is wrongly mapped to a variable collation element
1569 in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
1570 ignored when C<variable> isn't C<Non-ignorable>.
1572 The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
1573 see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
1574 (7.1.4 Trailing Weights). Such a correction is reproduced by this.
1576 overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
1578 This workaround is unnecessary since Unicode 6.3.0.
1582 -- see 5.4 Preprocessing, UTS #10.
1584 If specified, the coderef is used to preprocess each string
1585 before the formation of sort keys.
1587 ex. dropping English articles, such as "a" or "the".
1588 Then, "the pen" is before "a pencil".
1592 $str =~ s/\b(?:an?|the)\s+//gi;
1596 C<preprocess> is performed before C<normalization> (if defined).
1598 ex. decoding strings in a legacy encoding such as shift-jis:
1600 $sjis_collator = Unicode::Collate->new(
1601 preprocess => \&your_shiftjis_to_unicode_decoder,
1603 @result = $sjis_collator->sort(@shiftjis_strings);
1605 B<Note:> Strings returned from the coderef will be interpreted
1606 according to Perl's Unicode support. See L<perlunicode>,
1607 L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
1611 -- see 3.5 Rearrangement, UTS #10.
1613 Characters that are not coded in logical order and to be rearranged.
1614 If C<UCA_Version> is equal to or less than 11, default is:
1616 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1618 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1619 (a reference to empty list) as the value for this key.
1621 If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1622 (i.e. no rearrangement).
1624 B<According to the version 9 of UCA, this parameter shall not be used;
1625 but it is not warned at present.>
1629 If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
1630 The coderef will get each line, and then should return a rewritten line
1631 according to the UCA file format.
1632 If the coderef returns an empty line, the line will be skipped.
1634 e.g. any primary ignorable characters into tertiary ignorable:
1638 $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
1642 This example shows rewriting weights. C<rewrite> is allowed to
1643 affect code points, weights, and the name.
1645 B<NOTE>: C<table> is available to use another table file;
1646 preparing a modified table once would be more efficient than
1647 rewriting lines on reading an unmodified table every time.
1651 -- see suppress contractions in 5.14.11 Special-Purpose Commands,
1654 Contractions beginning with the specified characters are suppressed,
1655 even if those contractions are defined in C<table>.
1657 An example for Russian and some languages using the Cyrillic script:
1659 suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
1661 where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
1663 B<NOTE>: Contractions via C<entry> are not suppressed.
1667 -- see 3.8 Default Unicode Collation Element Table, UTS #10.
1669 You can use another collation element table if desired.
1671 The table file should be located in the F<Unicode/Collate> directory
1672 on C<@INC>. Say, if the filename is F<Foo.txt>,
1673 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1675 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1676 If you will prepare your own table file, any name other than F<allkeys.txt>
1677 may be better to avoid namespace conflict.
1679 B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1680 module, and it may save time at the run time.
1681 Explicit saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
1682 or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
1683 C<rewrite> will prevent this module from using the compiled DUCET.
1685 If C<undef> is passed explicitly as the value for this key,
1686 no file is read (but you can define collation elements via C<entry>).
1688 A typical way to define a collation element table
1689 without any file of table:
1691 $onlyABC = Unicode::Collate->new(
1693 entry => << 'ENTRIES',
1694 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1695 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1696 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1697 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1698 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1699 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1703 If C<ignoreName> or C<undefName> is used, character names should be
1704 specified as a comment (following C<#>) on each line.
1710 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1712 Undefines the collation element as if it were unassigned in the C<table>.
1713 This reduces the size of the table.
1714 If an unassigned character appears in the string to be collated,
1715 the sort key is made from its codepoint
1716 as a single-character collation element,
1717 as it is greater than any other assigned collation elements
1718 (in the codepoint order among the unassigned characters).
1719 But, it'd be better to ignore characters
1720 unfamiliar to you and maybe never used.
1722 Through C<undefChar>, any character matching C<qr/$undefChar/>
1723 will be undefined. Through C<undefName>, any character whose name
1724 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1727 ex. Collation weights for beyond-BMP characters are not stored in object:
1729 undefChar => qr/[^\0-\x{fffd}]/,
1731 =item upper_before_lower
1733 -- see 6.6 Case Comparisons, UTS #10.
1735 By default, lowercase is before uppercase.
1736 If the parameter is made true, this is reversed.
1738 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1739 distinctions must occur in level 3, and their weights at level 3 must be
1740 same as those mentioned in 7.3.1, UTS #10.
1741 If you define collation elements that differ from this requirement,
1742 this parameter does not work correctly.
1746 -- see 3.6 Variable Weighting, UTS #10.
1748 This key allows for variable weighting of variable collation elements,
1749 which are marked with an ASTERISK in the table
1750 (NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
1752 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1754 These names are case-insensitive.
1755 By default (if specification is omitted), 'shifted' is adopted.
1757 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1758 considered at the 4th level.
1760 'Non-Ignorable' Variable elements are not reset to ignorable.
1762 'Shifted' Variable elements are made ignorable at levels 1 through 3;
1763 their level 4 weight is replaced by the old level 1 weight.
1764 Level 4 weight for Non-Variable elements is 0xFFFF.
1766 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1771 =head2 Methods for Collation
1775 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1777 Sorts a list of strings.
1779 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1781 Returns 1 (when C<$a> is greater than C<$b>)
1782 or 0 (when C<$a> is equal to C<$b>)
1783 or -1 (when C<$a> is less than C<$b>).
1785 =item C<$result = $Collator-E<gt>eq($a, $b)>
1787 =item C<$result = $Collator-E<gt>ne($a, $b)>
1789 =item C<$result = $Collator-E<gt>lt($a, $b)>
1791 =item C<$result = $Collator-E<gt>le($a, $b)>
1793 =item C<$result = $Collator-E<gt>gt($a, $b)>
1795 =item C<$result = $Collator-E<gt>ge($a, $b)>
1797 They work like the operators of the same names.
1799 eq : whether $a is equal to $b.
1800 ne : whether $a is not equal to $b.
1801 lt : whether $a is less than $b.
1802 le : whether $a is less than $b or equal to $b.
1803 gt : whether $a is greater than $b.
1804 ge : whether $a is greater than $b or equal to $b.
1806 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1808 -- see 4.3 Form Sort Key, UTS #10.
1812 You compare the sort keys using a binary comparison
1813 and get the result of the comparison of the strings using UCA.
1815 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1819 $Collator->cmp($a, $b)
1821 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1823 Converts a sorting key into its representation form.
1824 If C<UCA_Version> is 8, the output is slightly different.
1826 use Unicode::Collate;
1827 my $c = Unicode::Collate->new();
1828 print $c->viewSortKey("Perl"),"\n";
1831 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1832 # Level 1 Level 2 Level 3 Level 4
1836 =head2 Methods for Searching
1838 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1839 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1840 except that they do not recognize patterns, only literal substrings.
1842 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1843 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1844 C<subst>, C<gsubst>) will croak, as the position and the length might
1845 differ from those on the specified string.
1847 C<rearrange> and C<hangul_terminator> parameters are neglected.
1848 C<katakana_before_hiragana> and C<upper_before_lower> don't affect
1849 matching and searching, as it doesn't matter whether greater or less.
1853 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1855 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1857 If C<$substring> matches a part of C<$string>, returns
1858 the position of the first occurrence of the matching part in scalar context;
1859 in list context, returns a two-element list of
1860 the position and the length of the matching part.
1862 If C<$substring> does not match any part of C<$string>,
1863 returns C<-1> in scalar context and
1864 an empty list in list context.
1866 e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
1867 you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
1869 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1870 # (normalization => undef) is REQUIRED.
1872 if (my($pos,$len) = $Collator->index($str, $sub)) {
1873 $match = substr($str, $pos, $len);
1876 and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
1877 is primary equal to C<"M>E<uuml>C<SS">.
1879 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1881 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1883 If C<$substring> matches a part of C<$string>, in scalar context, returns
1884 B<a reference to> the first occurrence of the matching part
1885 (C<$match_ref> is always true if matches,
1886 since every reference is B<true>);
1887 in list context, returns the first occurrence of the matching part.
1889 If C<$substring> does not match any part of C<$string>,
1890 returns C<undef> in scalar context and
1891 an empty list in list context.
1895 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1896 print "matches [$$match_ref].\n";
1898 print "doesn't match.\n";
1903 if (($match) = $Collator->match($str, $sub)) { # list context
1904 print "matches [$match].\n";
1906 print "doesn't match.\n";
1909 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1911 If C<$substring> matches a part of C<$string>, returns
1912 all the matching parts (or matching count in scalar context).
1914 If C<$substring> does not match any part of C<$string>,
1915 returns an empty list.
1917 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1919 If C<$substring> matches a part of C<$string>,
1920 the first occurrence of the matching part is replaced by C<$replacement>
1921 (C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
1923 C<$replacement> can be a C<CODEREF>,
1924 taking the matching part as an argument,
1925 and returning a string to replace the matching part
1926 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1928 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1930 If C<$substring> matches a part of C<$string>,
1931 all the occurrences of the matching part are replaced by C<$replacement>
1932 (C<$string> is modified) and C<$count> is returned.
1934 C<$replacement> can be a C<CODEREF>,
1935 taking the matching part as an argument,
1936 and returning a string to replace the matching part
1937 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1941 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1942 # (normalization => undef) is REQUIRED.
1943 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
1944 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1946 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
1947 # i.e., all the camels are made bold-faced.
1949 Examples: levels and ignore_level2 - what does camel match?
1950 ---------------------------------------------------------------------------
1951 level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
1952 -----------------------|---------------------------------------------------
1953 1 false | yes yes yes yes yes
1954 2 false | yes yes no yes yes
1955 3 false | yes no no yes yes
1956 4 false | yes no no no yes
1957 -----------------------|---------------------------------------------------
1958 1 true | yes yes yes yes yes
1959 2 true | yes yes yes yes yes
1960 3 true | yes no yes yes yes
1961 4 true | yes no yes no yes
1962 ---------------------------------------------------------------------------
1963 note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
1968 =head2 Other Methods
1972 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1974 =item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
1976 Changes the value of specified keys and returns the changed part.
1978 $Collator = Unicode::Collate->new(level => 4);
1980 $Collator->eq("perl", "PERL"); # false
1982 %old = $Collator->change(level => 2); # returns (level => 4).
1984 $Collator->eq("perl", "PERL"); # true
1986 $Collator->change(%old); # returns (level => 2).
1988 $Collator->eq("perl", "PERL"); # false
1990 Not all C<(key,value)>s are allowed to be changed.
1991 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1993 In the scalar context, returns the modified collator
1994 (but it is B<not> a clone from the original).
1996 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1998 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
2000 $Collator->change(level => 4)->eq("perl", "PERL"); # false
2002 =item C<$version = $Collator-E<gt>version()>
2004 Returns the version number (a string) of the Unicode Standard
2005 which the C<table> file used by the collator object is based on.
2006 If the table does not include a version line (starting with C<@version>),
2007 returns C<"unknown">.
2009 =item C<UCA_Version()>
2011 Returns the revision number of UTS #10 this module consults,
2012 that should correspond with the DUCET incorporated.
2014 =item C<Base_Unicode_Version()>
2016 Returns the version number of the Unicode Standard this module consults,
2017 that should correspond with the DUCET incorporated.
2023 No method will be exported.
2027 Though this module can be used without any C<table> file,
2028 to use this module easily, it is recommended to install a table file
2029 in the UCA format, by copying it under the directory
2030 <a place in @INC>/Unicode/Collate.
2032 The most preferable one is "The Default Unicode Collation Element Table"
2033 (aka DUCET), available from the Unicode Consortium's website:
2035 http://www.unicode.org/Public/UCA/
2037 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
2039 If DUCET is not installed, it is recommended to copy the file
2040 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
2041 to <a place in @INC>/Unicode/Collate/allkeys.txt
2050 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
2051 module (see L<Unicode::Normalize>).
2053 If you do not need it (say, in the case when you need not
2054 handle any combining characters),
2055 assign C<(normalization =E<gt> undef)> explicitly.
2057 -- see 6.5 Avoiding Normalization, UTS #10.
2059 =item Conformance Test
2061 The Conformance Test for the UCA is available
2062 under L<http://www.unicode.org/Public/UCA/>.
2064 For F<CollationTest_SHIFTED.txt>,
2065 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
2066 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
2067 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
2069 If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
2070 C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
2071 C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
2072 C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
2074 B<Unicode::Normalize is required to try The Conformance Test.>
2078 =head1 AUTHOR, COPYRIGHT AND LICENSE
2080 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
2081 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2014,
2082 SADAHIRO Tomoyuki. Japan. All rights reserved.
2084 This module is free software; you can redistribute it and/or
2085 modify it under the same terms as Perl itself.
2087 The file Unicode/Collate/allkeys.txt was copied verbatim
2088 from L<http://www.unicode.org/Public/UCA/6.3.0/allkeys.txt>.
2089 For this file, Copyright (c) 2001-2012 Unicode, Inc.
2090 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
2096 =item Unicode Collation Algorithm - UTS #10
2098 L<http://www.unicode.org/reports/tr10/>
2100 =item The Default Unicode Collation Element Table (DUCET)
2102 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
2104 =item The conformance test for the UCA
2106 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
2108 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
2110 =item Hangul Syllable Type
2112 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
2114 =item Unicode Normalization Forms - UAX #15
2116 L<http://www.unicode.org/reports/tr15/>
2118 =item Unicode Locale Data Markup Language (LDML) - UTS #35
2120 L<http://www.unicode.org/reports/tr35/>