1 package Unicode::Collate;
4 unless ("A" eq pack('U', 0x41)) {
5 die "Unicode::Collate cannot stringify a Unicode code point\n";
17 our $VERSION = '0.56';
18 our $PACKAGE = __PACKAGE__;
20 my @Path = qw(Unicode Collate);
21 my $KeyFile = "allkeys.txt";
24 use constant TRUE => 1;
25 use constant FALSE => "";
26 use constant NOMATCHPOS => -1;
28 # A coderef to get combining class imported from Unicode::Normalize
29 # (i.e. \&Unicode::Normalize::getCombinClass).
30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
34 use constant MinLevel => 1;
35 use constant MaxLevel => 4;
37 # Minimum weights at level 2 and 3, respectively
38 use constant Min2Wt => 0x20;
39 use constant Min3Wt => 0x02;
41 # Shifted weight at 4th level
42 use constant Shift4Wt => 0xFFFF;
44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
45 # PROBLEM: The Default Unicode Collation Element Table
46 # has weights over 0xFFFF at the 4th level.
47 # The tie-breaking in the variable weights
48 # other than "shift" (as well as "shift-trimmed") is unreliable.
49 use constant VCE_TEMPLATE => 'Cn4';
51 # A sort key: 16-bit weights
52 # See also the PROBLEM on VCE_TEMPLATE above.
53 use constant KEY_TEMPLATE => 'n*';
55 # Level separator in a sort key:
56 # i.e. pack(KEY_TEMPLATE, 0)
57 use constant LEVEL_SEP => "\0\0";
59 # As Unicode code point separator for hash keys.
60 # A joined code point string (denoted by JCPS below)
61 # like "65;768" is used for internal processing
62 # instead of Perl's Unicode string like "\x41\x{300}",
63 # as the native code point is different from the Unicode code point
65 # This character must not be included in any stringified
66 # representation of an integer.
67 use constant CODE_SEP => ';';
69 # boolean values of variable weights
70 use constant NON_VAR => 0; # Non-Variable character
71 use constant VAR => 1; # Variable character
73 # specific code points
74 use constant Hangul_SBase => 0xAC00;
75 use constant Hangul_SIni => 0xAC00;
76 use constant Hangul_SFin => 0xD7A3;
77 use constant Hangul_NCount => 588;
78 use constant Hangul_TCount => 28;
79 use constant Hangul_LBase => 0x1100;
80 use constant Hangul_LIni => 0x1100;
81 use constant Hangul_LFin => 0x1159;
82 use constant Hangul_LFill => 0x115F;
83 use constant Hangul_LEnd => 0x115F; # Unicode 5.2.0
84 use constant Hangul_VBase => 0x1161;
85 use constant Hangul_VIni => 0x1160; # from Vowel Filler
86 use constant Hangul_VFin => 0x11A2;
87 use constant Hangul_VEnd => 0x11A7; # Unicode 5.2.0
88 use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
89 use constant Hangul_TIni => 0x11A8;
90 use constant Hangul_TFin => 0x11F9;
91 use constant Hangul_TEnd => 0x11FF; # Unicode 5.2.0
92 use constant HangulL2Ini => 0xA960; # Unicode 5.2.0
93 use constant HangulL2Fin => 0xA97C; # Unicode 5.2.0
94 use constant HangulV2Ini => 0xD7B0; # Unicode 5.2.0
95 use constant HangulV2Fin => 0xD7C6; # Unicode 5.2.0
96 use constant HangulT2Ini => 0xD7CB; # Unicode 5.2.0
97 use constant HangulT2Fin => 0xD7FB; # Unicode 5.2.0
99 use constant CJK_UidIni => 0x4E00;
100 use constant CJK_UidFin => 0x9FA5;
101 use constant CJK_UidF41 => 0x9FBB;
102 use constant CJK_UidF51 => 0x9FC3;
103 use constant CJK_UidF52 => 0x9FCB;
104 use constant CJK_ExtAIni => 0x3400;
105 use constant CJK_ExtAFin => 0x4DB5;
106 use constant CJK_ExtBIni => 0x20000;
107 use constant CJK_ExtBFin => 0x2A6D6;
108 use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2.0
109 use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2.0
111 # Logical_Order_Exception in PropList.txt
112 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
114 sub UCA_Version { "20" }
116 sub Base_Unicode_Version { "5.2.0" }
121 return pack('U*', @_);
125 return unpack('U*', shift(@_).pack('U*'));
132 blanked non-ignorable shifted shift-trimmed
133 / } = (); # keys lowercased
136 alternate backwards level normalization rearrange
137 katakana_before_hiragana upper_before_lower
138 overrideHangul overrideCJK preprocess UCA_Version
139 hangul_terminator variable
143 entry mapping table maxlength
144 ignoreChar ignoreName undefChar undefName variableTable
145 versionTable alternateTable backwardsTable forwardsTable rearrangeTable
146 derivCode normCode rearrangeHash backwardsFlag
148 # The hash key 'ignored' is deleted at v 0.21.
149 # The hash key 'isShift' is deleted at v 0.23.
150 # The hash key 'combining' is deleted at v 0.24.
151 # The hash key 'entries' is deleted at v 0.30.
152 # The hash key 'L3_ignorable' is deleted at v 0.40.
156 return $self->{versionTable} || 'unknown';
159 my (%ChangeOK, %ChangeNG);
160 @ChangeOK{ @ChangeOK } = ();
161 @ChangeNG{ @ChangeNG } = ();
167 if (exists $hash{variable} && exists $hash{alternate}) {
168 delete $hash{alternate};
170 elsif (!exists $hash{variable} && exists $hash{alternate}) {
171 $hash{variable} = $hash{alternate};
173 foreach my $k (keys %hash) {
174 if (exists $ChangeOK{$k}) {
175 $old{$k} = $self->{$k};
176 $self->{$k} = $hash{$k};
178 elsif (exists $ChangeNG{$k}) {
179 croak "change of $k via change() is not allowed!";
183 $self->checkCollator();
184 return wantarray ? %old : $self;
189 my $key = shift; # 'level' or 'backwards'
190 MinLevel <= $level or croak sprintf
191 "Illegal level %d (in value for key '%s') lower than %d.",
192 $level, $key, MinLevel;
193 $level <= MaxLevel or croak sprintf
194 "Unsupported level %d (in value for key '%s') higher than %d.",
195 $level, $key, MaxLevel;
201 11 => \&_derivCE_9, # 11 == 9
203 16 => \&_derivCE_14, # 16 == 14
210 _checkLevel($self->{level}, "level");
212 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
213 or croak "Illegal UCA version (passed $self->{UCA_Version}).";
215 $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
216 $self->{alternateTable} || 'shifted';
217 $self->{variable} = $self->{alternate} = lc($self->{variable});
218 exists $VariableOK{ $self->{variable} }
219 or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
221 if (! defined $self->{backwards}) {
222 $self->{backwardsFlag} = 0;
224 elsif (! ref $self->{backwards}) {
225 _checkLevel($self->{backwards}, "backwards");
226 $self->{backwardsFlag} = 1 << $self->{backwards};
230 $self->{backwardsFlag} = 0;
231 for my $b (@{ $self->{backwards} }) {
232 _checkLevel($b, "backwards");
235 for my $v (sort keys %level) {
236 $self->{backwardsFlag} += 1 << $v;
240 defined $self->{rearrange} or $self->{rearrange} = [];
241 ref $self->{rearrange}
242 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
244 # keys of $self->{rearrangeHash} are $self->{rearrange}.
245 $self->{rearrangeHash} = undef;
247 if (@{ $self->{rearrange} }) {
248 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
251 $self->{normCode} = undef;
253 if (defined $self->{normalization}) {
254 eval { require Unicode::Normalize };
255 $@ and croak "Unicode::Normalize is required to normalize strings";
257 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
259 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
260 $self->{normCode} = \&Unicode::Normalize::NFD;
262 elsif ($self->{normalization} ne 'prenormalized') {
263 my $norm = $self->{normalization};
264 $self->{normCode} = sub {
265 Unicode::Normalize::normalize($norm, shift);
267 eval { $self->{normCode}->("") }; # try
268 $@ and croak "$PACKAGE unknown normalization form name: $norm";
277 my $self = bless { @_ }, $class;
279 # If undef is passed explicitly, no file is read.
280 $self->{table} = $KeyFile if ! exists $self->{table};
281 $self->read_table() if defined $self->{table};
283 if ($self->{entry}) {
284 while ($self->{entry} =~ /([^\n]+)/g) {
285 $self->parseEntry($1);
289 $self->{level} ||= MaxLevel;
290 $self->{UCA_Version} ||= UCA_Version();
292 $self->{overrideHangul} = FALSE
293 if ! exists $self->{overrideHangul};
294 $self->{overrideCJK} = FALSE
295 if ! exists $self->{overrideCJK};
296 $self->{normalization} = 'NFD'
297 if ! exists $self->{normalization};
298 $self->{rearrange} = $self->{rearrangeTable} ||
299 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
300 if ! exists $self->{rearrange};
301 $self->{backwards} = $self->{backwardsTable}
302 if ! exists $self->{backwards};
304 $self->checkCollator();
311 my $line = shift; # after s/^\s*\@//
313 if ($line =~ /^version\s*(\S*)/) {
314 $self->{versionTable} ||= $1;
316 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
317 $self->{variableTable} ||= $1;
319 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
320 $self->{alternateTable} ||= $1;
322 elsif ($line =~ /^backwards\s+(\S*)/) {
323 push @{ $self->{backwardsTable} }, $1;
325 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
326 push @{ $self->{forwardsTable} }, $1;
328 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
329 push @{ $self->{rearrangeTable} }, _getHexArray($1);
337 foreach my $d (@INC) {
338 $f = File::Spec->catfile($d, @Path, $self->{table});
339 last if open($fh, $f);
343 $f = File::Spec->catfile(@Path, $self->{table});
344 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
347 while (my $line = <$fh>) {
348 next if $line =~ /^\s*#/;
350 if ($line =~ s/^\s*\@//) {
351 $self->parseAtmark($line);
353 $self->parseEntry($line);
361 ## get $line, parse it, and write an entry in $self
367 my($name, $entry, @uv, @key);
369 return if $line !~ /^\s*[0-9A-Fa-f]/;
371 # removes comment and gets name
373 if $line =~ s/[#%]\s*(.*)//;
374 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
377 my($e, $k) = split /;/, $line;
378 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
381 @uv = _getHexArray($e);
384 $entry = join(CODE_SEP, @uv); # in JCPS
386 if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
387 my $ele = pack_U(@uv);
389 # regarded as if it were not entried in the table
391 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
393 # replaced as completely ignorable
394 $k = '[.0000.0000.0000.0000]'
395 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
398 # replaced as completely ignorable
399 $k = '[.0000.0000.0000.0000]'
400 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
402 my $is_L3_ignorable = TRUE;
404 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
405 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
406 my @wt = _getHexArray($arr);
407 push @key, pack(VCE_TEMPLATE, $var, @wt);
408 $is_L3_ignorable = FALSE
409 if $wt[0] || $wt[1] || $wt[2];
410 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
411 # is completely ignorable.
412 # For expansion, an entry $is_L3_ignorable
413 # if and only if "all" CEs are [.0000.0000.0000].
416 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
419 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
420 and $self->{maxlength}{$uv[0]} = @uv;
426 ## VCE = _varCE(variable term, VCE)
432 if ($vbl eq 'non-ignorable') {
435 my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
438 return pack(VCE_TEMPLATE, $var, 0, 0, 0,
439 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
441 elsif ($vbl eq 'blanked') {
445 return pack(VCE_TEMPLATE, $var, @wt[0..2],
446 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
453 $self->visualizeSortKey($self->getSortKey(@_));
459 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
461 if ($self->{UCA_Version} <= 8) {
462 $view =~ s/ ?0000 ?/|/g;
464 $view =~ s/\b0000\b/|/g;
471 ## arrayref of JCPS = splitEnt(string to be collated)
472 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
479 my $code = $self->{preprocess};
480 my $norm = $self->{normCode};
481 my $map = $self->{mapping};
482 my $max = $self->{maxlength};
483 my $reH = $self->{rearrangeHash};
484 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
489 $code and croak "Preprocess breaks character positions. "
490 . "Don't use with index(), match(), etc.";
491 $norm and croak "Normalization breaks character positions. "
492 . "Don't use with index(), match(), etc.";
497 $str = &$code($str) if ref $code;
498 $str = &$norm($str) if ref $norm;
501 # get array of Unicode code point of string.
502 my @src = unpack_U($str);
505 # Character positions are not kept if rearranged,
506 # then neglected if $wLen is true.
507 if ($reH && ! $wLen) {
508 for (my $i = 0; $i < @src; $i++) {
509 if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
510 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
516 # remove a code point marked as a completely ignorable.
517 for (my $i = 0; $i < @src; $i++) {
519 if _isIllegal($src[$i]) || ($ver9 &&
520 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
523 for (my $i = 0; $i < @src; $i++) {
526 # skip removed code point
527 if (! defined $jcps) {
529 $buf[-1][2] = $i + 1;
538 my $temp_jcps = $jcps;
540 my $maxLen = $max->{$jcps};
542 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
543 next if ! defined $src[$p];
544 $temp_jcps .= CODE_SEP . $src[$p];
546 if ($map->{$temp_jcps}) {
552 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
553 # This process requires Unicode::Normalize.
554 # If "normalization" is undef, here should be skipped *always*
555 # (in spite of bool value of $CVgetCombinClass),
556 # since canonical ordering cannot be expected.
557 # Blocked combining character should not be contracted.
559 if ($self->{normalization})
560 # $self->{normCode} is false in the case of "prenormalized".
565 for (my $p = $i + 1; $p < @src; $p++) {
566 next if ! defined $src[$p];
567 $curCC = $CVgetCombinClass->($src[$p]);
569 my $tail = CODE_SEP . $src[$p];
570 if ($preCC != $curCC && $map->{$jcps.$tail}) {
580 # skip completely ignorable
581 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
583 $buf[-1][2] = $i + 1;
588 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
595 ## list of VCE = getWt(JCPS)
601 my $vbl = $self->{variable};
602 my $map = $self->{mapping};
603 my $der = $self->{derivCode};
605 return if !defined $u;
606 return map(_varCE($vbl, $_), @{ $map->{$u} })
609 # JCPS must not be a contraction, then it's a code point.
610 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
611 my $hang = $self->{overrideHangul};
614 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
616 elsif (!defined $hang) {
617 @hangulCE = $der->($u);
620 my $max = $self->{maxlength};
621 my @decH = _decompHangul($u);
624 my $contract = join(CODE_SEP, @decH);
625 @decH = ($contract) if $map->{$contract};
626 } else { # must be <@decH == 3>
627 if ($max->{$decH[0]}) {
628 my $contract = join(CODE_SEP, @decH);
629 if ($map->{$contract}) {
632 $contract = join(CODE_SEP, @decH[0,1]);
633 $map->{$contract} and @decH = ($contract, $decH[2]);
635 # even if V's ignorable, LT contraction is not supported.
636 # If such a situatution were required, NFD should be used.
638 if (@decH == 3 && $max->{$decH[1]}) {
639 my $contract = join(CODE_SEP, @decH[1,2]);
640 $map->{$contract} and @decH = ($decH[0], $contract);
645 $map->{$_} ? @{ $map->{$_} } : $der->($_);
648 return map _varCE($vbl, $_), @hangulCE;
650 elsif (_isUIdeo($u, $self->{UCA_Version})) {
651 my $cjk = $self->{overrideCJK};
652 return map _varCE($vbl, $_),
654 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
655 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
660 return map _varCE($vbl, $_), $der->($u);
666 ## string sortkey = getSortKey(string arg)
671 my $lev = $self->{level};
672 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
673 my $vers = $self->{UCA_Version};
674 my $v2i = $vers >= 9 && $self->{variable} ne 'non-ignorable';
676 my @buf; # weight arrays
677 if ($self->{hangul_terminator}) {
679 foreach my $jcps (@$rEnt) {
680 # weird things like VL, TL-contraction are not considered!
682 foreach my $u (split /;/, $jcps) {
683 $curHST .= getHST($u, $vers);
685 if ($preHST && !$curHST || # hangul before non-hangul
686 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
687 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
688 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
690 push @buf, $self->getWtHangulTerm();
694 push @buf, $self->getWt($jcps);
696 $preHST # end at hangul
697 and push @buf, $self->getWtHangulTerm();
700 foreach my $jcps (@$rEnt) {
701 push @buf, $self->getWt($jcps);
706 my @ret = ([],[],[],[]);
707 my $last_is_variable;
709 foreach my $vwt (@buf) {
710 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
712 # "Ignorable (L1, L2) after Variable" since track. v. 9
715 $last_is_variable = TRUE;
717 elsif (!$wt[0]) { # ignorable
718 next if $last_is_variable;
721 $last_is_variable = FALSE;
724 foreach my $v (0..$lev-1) {
725 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
729 # modification of tertiary weights
730 if ($self->{upper_before_lower}) {
731 foreach my $w (@{ $ret[2] }) {
732 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
733 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
734 elsif ($w == 0x1C) { $w += 1 } # square upper
735 elsif ($w == 0x1D) { $w -= 1 } # square lower
738 if ($self->{katakana_before_hiragana}) {
739 foreach my $w (@{ $ret[2] }) {
740 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
741 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
745 if ($self->{backwardsFlag}) {
746 for (my $v = MinLevel; $v <= MaxLevel; $v++) {
747 if ($self->{backwardsFlag} & (1 << $v)) {
748 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
753 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
758 ## int compare = cmp(string a, string b)
760 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
761 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) }
762 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) }
763 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) }
764 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) }
765 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) }
766 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) }
769 ## list[strings] sorted = sort(list[strings] arg)
775 sort{ $a->[0] cmp $b->[0] }
776 map [ $obj->getSortKey($_), $_ ], @_;
782 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52) ? 0xFB40 : # CJK
783 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
784 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
785 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ? 0xFB80 # CJK ext.
787 my $aaaa = $base + ($u >> 15);
788 my $bbbb = ($u & 0x7FFF) | 0x8000;
789 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
790 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
795 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51) ? 0xFB40 : # CJK
796 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
797 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext.
799 my $aaaa = $base + ($u >> 15);
800 my $bbbb = ($u & 0x7FFF) | 0x8000;
801 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
802 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
807 my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41) ? 0xFB40 : # CJK
808 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
809 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext.
811 my $aaaa = $base + ($u >> 15);
812 my $bbbb = ($u & 0x7FFF) | 0x8000;
813 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
814 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
819 my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin) ? 0xFB40 : # CJK
820 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
821 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80 # CJK ext.
823 my $aaaa = $base + ($u >> 15);
824 my $bbbb = ($u & 0x7FFF) | 0x8000;
825 return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
826 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
831 my $aaaa = 0xFF80 + ($code >> 15);
832 my $bbbb = ($code & 0x7FFF) | 0x8000;
834 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
835 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
840 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
844 my ($u, $uca_vers) = @_;
845 return((CJK_UidIni <= $u && (
846 $uca_vers >= 20 ? ($u <= CJK_UidF52) :
847 $uca_vers >= 18 ? ($u <= CJK_UidF51) :
848 $uca_vers >= 14 ? ($u <= CJK_UidF41) :
851 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
853 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
856 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
861 sub getWtHangulTerm {
863 return _varCE($self->{variable},
864 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
869 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
871 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
874 # $code *must* be in Hangul syllable.
875 # Check it before you enter here.
879 my $si = $code - Hangul_SBase;
880 my $li = int( $si / Hangul_NCount);
881 my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
882 my $ti = $si % Hangul_TCount;
886 $ti ? (Hangul_TBase + $ti) : (),
892 return ! defined $code # removed
893 || ($code < 0 || 0x10FFFF < $code) # out of range
894 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
895 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
896 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
900 # Hangul Syllable Type
903 my $vers = shift || 0;
905 if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
906 return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
910 return Hangul_LIni <= $u && $u <= Hangul_LFin ||
911 $u == Hangul_LFill ? "L" :
912 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
913 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : "";
915 return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
916 HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" :
917 Hangul_VIni <= $u && $u <= Hangul_VEnd ||
918 HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" :
919 Hangul_TIni <= $u && $u <= Hangul_TEnd ||
920 HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : "";
926 ## bool _nonIgnorAtLevel(arrayref weights, int level)
928 sub _nonIgnorAtLevel($$)
931 return if ! defined $wt;
933 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
938 ## arrayref of arrayref[weights] source,
939 ## arrayref of arrayref[weights] substr,
941 ## * comparison of graphemes vs graphemes.
942 ## @$source >= @$substr must be true (check it before call this);
950 for my $g (0..@$substr-1){
951 # Do the $g'th graphemes have the same number of AV weigths?
952 return if @{ $source->[$g] } != @{ $substr->[$g] };
954 for my $w (0..@{ $substr->[$g] }-1) {
955 for my $v (0..$lev-1) {
956 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
964 ## (int position, int length)
965 ## int position = index(string, substring, position, [undoc'ed grobal])
967 ## With "grobal" (only for the list context),
968 ## returns list of arrayref[position, length].
974 my $len = length($str);
975 my $subE = $self->splitEnt(shift);
976 my $pos = @_ ? shift : 0;
977 $pos = 0 if $pos < 0;
980 my $lev = $self->{level};
981 my $v2i = $self->{UCA_Version} >= 9 &&
982 $self->{variable} ne 'non-ignorable';
985 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
987 ? map([$_, 0], $temp..$len)
988 : wantarray ? ($temp,0) : $temp;
991 and return wantarray ? () : NOMATCHPOS;
992 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
994 or return wantarray ? () : NOMATCHPOS;
996 my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
998 my $last_is_variable;
999 for my $vwt (map $self->getWt($_), @$subE) {
1000 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1001 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
1003 # "Ignorable (L1, L2) after Variable" since track. v. 9
1006 $last_is_variable = TRUE;
1008 elsif (!$wt[0]) { # ignorable
1009 $to_be_pushed = FALSE if $last_is_variable;
1012 $last_is_variable = FALSE;
1016 if (@subWt && !$var && !$wt[0]) {
1017 push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1019 push @subWt, [ \@wt ];
1024 my $end = @$strE - 1;
1026 $last_is_variable = FALSE; # reuse
1027 for (my $i = 0; $i <= $end; ) { # no $i++
1031 while ($i <= $end && $found_base == 0) {
1032 for my $vwt ($self->getWt($strE->[$i][0])) {
1033 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1034 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
1036 # "Ignorable (L1, L2) after Variable" since track. v. 9
1039 $last_is_variable = TRUE;
1041 elsif (!$wt[0]) { # ignorable
1042 $to_be_pushed = FALSE if $last_is_variable;
1045 $last_is_variable = FALSE;
1049 if (@strWt && !$var && !$wt[0]) {
1050 push @{ $strWt[-1] }, \@wt if $to_be_pushed;
1051 $finPos[-1] = $strE->[$i][2];
1052 } elsif ($to_be_pushed) {
1053 push @strWt, [ \@wt ];
1054 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1055 $finPos[-1] = NOMATCHPOS if $found_base;
1056 push @finPos, $strE->[$i][2];
1065 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1066 if ($iniPos[0] != NOMATCHPOS &&
1067 $finPos[$#subWt] != NOMATCHPOS &&
1068 _eqArray(\@strWt, \@subWt, $lev)) {
1069 my $temp = $iniPos[0] + $pos;
1072 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1073 splice @strWt, 0, $#subWt;
1074 splice @iniPos, 0, $#subWt;
1075 splice @finPos, 0, $#subWt;
1079 ? ($temp, $finPos[$#subWt] - $iniPos[0])
1091 : wantarray ? () : NOMATCHPOS;
1095 ## scalarref to matching part = match(string, substring)
1100 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1101 my $temp = substr($_[0], $pos, $len);
1102 return wantarray ? $temp : \$temp;
1103 # An lvalue ref \substr should be avoided,
1104 # since its value is affected by modification of its referent.
1112 ## arrayref matching parts = gmatch(string, substring)
1119 return map substr($str, $_->[0], $_->[1]),
1120 $self->index($str, $sub, 0, 'g');
1124 ## bool subst'ed = subst(string, substring, replace)
1129 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1131 if (my($pos,$len) = $self->index($_[0], $_[1])) {
1133 my $mat = substr($_[0], $pos, $len);
1134 substr($_[0], $pos, $len, $code->($mat));
1136 substr($_[0], $pos, $len, $_[2]);
1146 ## int count = gsubst(string, substring, replace)
1151 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1154 # Replacement is carried out from the end, then use reverse.
1155 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1157 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1158 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1160 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1172 Unicode::Collate - Unicode Collation Algorithm
1176 use Unicode::Collate;
1179 $Collator = Unicode::Collate->new(%tailoring);
1182 @sorted = $Collator->sort(@not_sorted);
1185 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1187 # If %tailoring is false (i.e. empty),
1188 # $Collator should do the default collation.
1192 This module is an implementation of Unicode Technical Standard #10
1193 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1195 =head2 Constructor and Tailoring
1197 The C<new> method returns a collator object.
1199 $Collator = Unicode::Collate->new(
1200 UCA_Version => $UCA_Version,
1201 alternate => $alternate, # deprecated: use of 'variable' is recommended.
1202 backwards => $levelNumber, # or \@levelNumbers
1204 hangul_terminator => $term_primary_weight,
1205 ignoreName => qr/$ignoreName/,
1206 ignoreChar => qr/$ignoreChar/,
1207 katakana_before_hiragana => $bool,
1208 level => $collationLevel,
1209 normalization => $normalization_form,
1210 overrideCJK => \&overrideCJK,
1211 overrideHangul => \&overrideHangul,
1212 preprocess => \&preprocess,
1213 rearrange => \@charList,
1215 undefName => qr/$undefName/,
1216 undefChar => qr/$undefChar/,
1217 upper_before_lower => $bool,
1218 variable => $variable,
1225 If the tracking version number of UCA is given,
1226 behavior of that tracking version is emulated on collating.
1227 If omitted, the return value of C<UCA_Version()> is used.
1228 C<UCA_Version()> should return the latest tracking version supported.
1230 The supported tracking version: 8, 9, 11, 14, 16, 18 or 20.
1232 UCA Unicode Standard DUCET (@version)
1233 ---------------------------------------------------
1234 8 3.1 3.0.1 (3.0.1d9)
1235 9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
1236 11 4.0 4.0.0 (4.0.0)
1237 14 4.1.0 4.1.0 (4.1.0)
1238 16 5.0 5.0.0 (5.0.0)
1239 18 5.1.0 5.1.0 (5.1.0)
1240 20 5.2.0 5.2.0 (5.2.0)
1242 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1246 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1248 For backward compatibility, C<alternate> (old name) can be used
1249 as an alias for C<variable>.
1253 -- see 3.1.2 French Accents, UTS #10.
1255 backwards => $levelNumber or \@levelNumbers
1257 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1258 If omitted, forwards at all the levels.
1262 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1264 If the same character (or a sequence of characters) exists
1265 in the collation element table through C<table>,
1266 mapping to collation elements is overrided.
1267 If it does not exist, the mapping is defined additionally.
1269 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1270 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1271 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1272 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1273 006C 006C ; [.0F4C.0020.0002.006C] # ll
1274 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1275 004C 004C ; [.0F4C.0020.0008.004C] # LL
1276 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
1277 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1278 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
1279 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1282 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1283 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1284 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1287 B<NOTE:> The code point in the UCA file format (before C<';'>)
1288 B<must> be a Unicode code point (defined as hexadecimal),
1289 but not a native code point.
1290 So C<0063> must always denote C<U+0063>,
1291 but not a character of C<"\x63">.
1293 Weighting may vary depending on collation element table.
1294 So ensure the weights defined in C<entry> will be consistent with
1295 those in the collation element table loaded via C<table>.
1297 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1298 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1299 (as a value between C<0E60> and C<0E6D>)
1300 makes ordering as C<C E<lt> CH E<lt> D>.
1301 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1302 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1303 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1304 and C<c-curl> (C<U+0255>) with C<0E69>.
1305 Then primary weight C<0E6A> for C<CH> makes C<CH>
1306 ordered between C<c-curl> and C<D>.
1308 =item hangul_terminator
1310 -- see 7.1.4 Trailing Weights, UTS #10.
1312 If a true value is given (non-zero but should be positive),
1313 it will be added as a terminator primary weight to the end of
1314 every standard Hangul syllable. Secondary and any higher weights
1315 for terminator are set to zero.
1316 If the value is false or C<hangul_terminator> key does not exist,
1317 insertion of terminator weights will not be performed.
1319 Boundaries of Hangul syllables are determined
1320 according to conjoining Jamo behavior in F<the Unicode Standard>
1321 and F<HangulSyllableType.txt>.
1323 B<Implementation Note:>
1324 (1) For expansion mapping (Unicode character mapped
1325 to a sequence of collation elements), a terminator will not be added
1326 between collation elements, even if Hangul syllable boundary exists there.
1327 Addition of terminator is restricted to the next position
1328 to the last collation element.
1330 (2) Non-conjoining Hangul letters
1331 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1332 automatically terminated with a terminator primary weight.
1333 These characters may need terminator included in a collation element
1340 -- see 3.2.2 Variable Weighting, UTS #10.
1342 Makes the entry in the table completely ignorable;
1343 i.e. as if the weights were zero at all level.
1345 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1346 will be ignored. Through C<ignoreName>, any character whose name
1347 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1350 E.g. when 'a' and 'e' are ignorable,
1351 'element' is equal to 'lament' (or 'lmnt').
1353 =item katakana_before_hiragana
1355 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1357 By default, hiragana is before katakana.
1358 If the parameter is made true, this is reversed.
1360 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1361 distinctions must occur in level 3, and their weights at level 3 must be
1362 same as those mentioned in 7.3.1, UTS #10.
1363 If you define your collation elements which violate this requirement,
1364 this parameter does not work validly.
1368 -- see 4.3 Form Sort Key, UTS #10.
1370 Set the maximum level.
1371 Any higher levels than the specified one are ignored.
1373 Level 1: alphabetic ordering
1374 Level 2: diacritic ordering
1375 Level 3: case ordering
1376 Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1380 If omitted, the maximum is the 4th.
1384 -- see 4.1 Normalize, UTS #10.
1386 If specified, strings are normalized before preparation of sort keys
1387 (the normalization is executed after preprocess).
1389 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1390 as C<$normalization_form>.
1391 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1392 See C<Unicode::Normalize::normalize()> for detail.
1393 If omitted, C<'NFD'> is used.
1395 C<normalization> is performed after C<preprocess> (if defined).
1397 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1398 though they are not concerned with C<Unicode::Normalize::normalize()>.
1400 If C<undef> (not a string C<"undef">) is passed explicitly
1401 as the value for this key,
1402 any normalization is not carried out (this may make tailoring easier
1403 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1404 only contiguous contractions are resolved;
1405 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1406 C<A-cedilla-ring> would be primary equal to C<A>.
1408 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1409 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1411 In the case of C<(normalization =E<gt> "prenormalized")>,
1412 any normalization is not performed, but
1413 non-contiguous contractions with combining characters are performed.
1415 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1416 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1417 If source strings are finely prenormalized,
1418 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1420 Except C<(normalization =E<gt> undef)>,
1421 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1425 -- see 7.1 Derived Collation Elements, UTS #10.
1427 By default, CJK Unified Ideographs are ordered in Unicode codepoint
1428 order but C<CJK Unified Ideographs> are lesser than
1429 C<CJK Unified Ideographs Extension>.
1431 CJK Unified Ideographs:
1432 U+4E00..U+9FA5 if UCA_Version is 8 to 11;
1433 U+4E00..U+9FBB if UCA_Version is 14 to 16;
1434 U+4E00..U+9FC3 if UCA_Version is 18;
1435 U+4E00..U+9FCB if UCA_Version> is 20.
1437 CJK Unified Ideographs Extension:
1438 Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) if UCA_Version < 20;
1439 Ext.A, Ext.B and Ext.C (U+2A700..U+2B734) if UCA_Version is 20.
1441 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1443 ex. CJK Unified Ideographs in the JIS code point order.
1445 overrideCJK => sub {
1446 my $u = shift; # get a Unicode codepoint
1447 my $b = pack('n', $u); # to UTF-16BE
1448 my $s = your_unicode_to_sjis_converter($b); # convert
1449 my $n = unpack('n', $s); # convert sjis to short
1450 [ $n, 0x20, 0x2, $u ]; # return the collation element
1453 ex. ignores all CJK Unified Ideographs.
1455 overrideCJK => sub {()}, # CODEREF returning empty list
1457 # where ->eq("Pe\x{4E00}rl", "Perl") is true
1458 # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1460 If C<undef> is passed explicitly as the value for this key,
1461 weights for CJK Unified Ideographs are treated as undefined.
1462 But assignment of weight for CJK Unified Ideographs
1463 in table or C<entry> is still valid.
1465 =item overrideHangul
1467 -- see 7.1 Derived Collation Elements, UTS #10.
1469 By default, Hangul Syllables are decomposed into Hangul Jamo,
1470 even if C<(normalization =E<gt> undef)>.
1471 But the mapping of Hangul Syllables may be overrided.
1473 This parameter works like C<overrideCJK>, so see there for examples.
1475 If you want to override the mapping of Hangul Syllables,
1476 NFD, NFKD, and FCD are not appropriate,
1477 since they will decompose Hangul Syllables before overriding.
1479 If C<undef> is passed explicitly as the value for this key,
1480 weight for Hangul Syllables is treated as undefined
1481 without decomposition into Hangul Jamo.
1482 But definition of weight for Hangul Syllables
1483 in table or C<entry> is still valid.
1487 -- see 5.1 Preprocessing, UTS #10.
1489 If specified, the coderef is used to preprocess
1490 before the formation of sort keys.
1492 ex. dropping English articles, such as "a" or "the".
1493 Then, "the pen" is before "a pencil".
1497 $str =~ s/\b(?:an?|the)\s+//gi;
1501 C<preprocess> is performed before C<normalization> (if defined).
1505 -- see 3.1.3 Rearrangement, UTS #10.
1507 Characters that are not coded in logical order and to be rearranged.
1508 If C<UCA_Version> is equal to or lesser than 11, default is:
1510 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1512 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1513 (a reference to empty list) as the value for this key.
1515 If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1516 (i.e. no rearrangement).
1518 B<According to the version 9 of UCA, this parameter shall not be used;
1519 but it is not warned at present.>
1523 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1525 You can use another collation element table if desired.
1527 The table file should locate in the F<Unicode/Collate> directory
1528 on C<@INC>. Say, if the filename is F<Foo.txt>,
1529 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1531 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1532 If you will prepare your own table file, any name other than F<allkeys.txt>
1533 may be better to avoid namespace conflict.
1535 B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1536 module, and it may save time at the run time.
1537 Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1538 or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
1539 will prevent this module using the compiled DUCET.
1541 If C<undef> is passed explicitly as the value for this key,
1542 no file is read (but you can define collation elements via C<entry>).
1544 A typical way to define a collation element table
1545 without any file of table:
1547 $onlyABC = Unicode::Collate->new(
1549 entry => << 'ENTRIES',
1550 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1551 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1552 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1553 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1554 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1555 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1559 If C<ignoreName> or C<undefName> is used, character names should be
1560 specified as a comment (following C<#>) on each line.
1566 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1568 Undefines the collation element as if it were unassigned in the table.
1569 This reduces the size of the table.
1570 If an unassigned character appears in the string to be collated,
1571 the sort key is made from its codepoint
1572 as a single-character collation element,
1573 as it is greater than any other assigned collation elements
1574 (in the codepoint order among the unassigned characters).
1575 But, it'd be better to ignore characters
1576 unfamiliar to you and maybe never used.
1578 Through C<undefChar>, any character matching C<qr/$undefChar/>
1579 will be undefined. Through C<undefName>, any character whose name
1580 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1583 ex. Collation weights for beyond-BMP characters are not stored in object:
1585 undefChar => qr/[^\0-\x{fffd}]/,
1587 =item upper_before_lower
1589 -- see 6.6 Case Comparisons, UTS #10.
1591 By default, lowercase is before uppercase.
1592 If the parameter is made true, this is reversed.
1594 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1595 distinctions must occur in level 3, and their weights at level 3 must be
1596 same as those mentioned in 7.3.1, UTS #10.
1597 If you define your collation elements which differs from this requirement,
1598 this parameter doesn't work validly.
1602 -- see 3.2.2 Variable Weighting, UTS #10.
1604 This key allows to variable weighting for variable collation elements,
1605 which are marked with an ASTERISK in the table
1606 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1608 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1610 These names are case-insensitive.
1611 By default (if specification is omitted), 'shifted' is adopted.
1613 'Blanked' Variable elements are made ignorable at levels 1 through 3;
1614 considered at the 4th level.
1616 'Non-Ignorable' Variable elements are not reset to ignorable.
1618 'Shifted' Variable elements are made ignorable at levels 1 through 3
1619 their level 4 weight is replaced by the old level 1 weight.
1620 Level 4 weight for Non-Variable elements is 0xFFFF.
1622 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
1627 =head2 Methods for Collation
1631 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1633 Sorts a list of strings.
1635 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1637 Returns 1 (when C<$a> is greater than C<$b>)
1638 or 0 (when C<$a> is equal to C<$b>)
1639 or -1 (when C<$a> is lesser than C<$b>).
1641 =item C<$result = $Collator-E<gt>eq($a, $b)>
1643 =item C<$result = $Collator-E<gt>ne($a, $b)>
1645 =item C<$result = $Collator-E<gt>lt($a, $b)>
1647 =item C<$result = $Collator-E<gt>le($a, $b)>
1649 =item C<$result = $Collator-E<gt>gt($a, $b)>
1651 =item C<$result = $Collator-E<gt>ge($a, $b)>
1653 They works like the same name operators as theirs.
1655 eq : whether $a is equal to $b.
1656 ne : whether $a is not equal to $b.
1657 lt : whether $a is lesser than $b.
1658 le : whether $a is lesser than $b or equal to $b.
1659 gt : whether $a is greater than $b.
1660 ge : whether $a is greater than $b or equal to $b.
1662 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1664 -- see 4.3 Form Sort Key, UTS #10.
1668 You compare the sort keys using a binary comparison
1669 and get the result of the comparison of the strings using UCA.
1671 $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1675 $Collator->cmp($a, $b)
1677 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1679 Converts a sorting key into its representation form.
1680 If C<UCA_Version> is 8, the output is slightly different.
1682 use Unicode::Collate;
1683 my $c = Unicode::Collate->new();
1684 print $c->viewSortKey("Perl"),"\n";
1687 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1688 # Level 1 Level 2 Level 3 Level 4
1692 =head2 Methods for Searching
1694 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1695 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1696 C<subst>, C<gsubst>) is croaked,
1697 as the position and the length might differ
1698 from those on the specified string.
1699 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1701 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1702 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1703 but they are not aware of any pattern, but only a literal substring.
1707 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1709 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1711 If C<$substring> matches a part of C<$string>, returns
1712 the position of the first occurrence of the matching part in scalar context;
1713 in list context, returns a two-element list of
1714 the position and the length of the matching part.
1716 If C<$substring> does not match any part of C<$string>,
1717 returns C<-1> in scalar context and
1718 an empty list in list context.
1722 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1723 # (normalization => undef) is REQUIRED.
1724 my $str = "Ich muß studieren Perl.";
1727 if (my($pos,$len) = $Collator->index($str, $sub)) {
1728 $match = substr($str, $pos, $len);
1731 and get C<"muß"> in C<$match> since C<"muß">
1732 is primary equal to C<"MÜSS">.
1734 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1736 =item C<($match) = $Collator-E<gt>match($string, $substring)>
1738 If C<$substring> matches a part of C<$string>, in scalar context, returns
1739 B<a reference to> the first occurrence of the matching part
1740 (C<$match_ref> is always true if matches,
1741 since every reference is B<true>);
1742 in list context, returns the first occurrence of the matching part.
1744 If C<$substring> does not match any part of C<$string>,
1745 returns C<undef> in scalar context and
1746 an empty list in list context.
1750 if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1751 print "matches [$$match_ref].\n";
1753 print "doesn't match.\n";
1758 if (($match) = $Collator->match($str, $sub)) { # list context
1759 print "matches [$match].\n";
1761 print "doesn't match.\n";
1764 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1766 If C<$substring> matches a part of C<$string>, returns
1767 all the matching parts (or matching count in scalar context).
1769 If C<$substring> does not match any part of C<$string>,
1770 returns an empty list.
1772 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1774 If C<$substring> matches a part of C<$string>,
1775 the first occurrence of the matching part is replaced by C<$replacement>
1776 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1778 C<$replacement> can be a C<CODEREF>,
1779 taking the matching part as an argument,
1780 and returning a string to replace the matching part
1781 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1783 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1785 If C<$substring> matches a part of C<$string>,
1786 all the occurrences of the matching part is replaced by C<$replacement>
1787 (C<$string> is modified) and return C<$count>.
1789 C<$replacement> can be a C<CODEREF>,
1790 taking the matching part as an argument,
1791 and returning a string to replace the matching part
1792 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1796 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1797 # (normalization => undef) is REQUIRED.
1798 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1799 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1801 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1802 # i.e., all the camels are made bold-faced.
1806 =head2 Other Methods
1810 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1812 Change the value of specified keys and returns the changed part.
1814 $Collator = Unicode::Collate->new(level => 4);
1816 $Collator->eq("perl", "PERL"); # false
1818 %old = $Collator->change(level => 2); # returns (level => 4).
1820 $Collator->eq("perl", "PERL"); # true
1822 $Collator->change(%old); # returns (level => 2).
1824 $Collator->eq("perl", "PERL"); # false
1826 Not all C<(key,value)>s are allowed to be changed.
1827 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1829 In the scalar context, returns the modified collator
1830 (but it is B<not> a clone from the original).
1832 $Collator->change(level => 2)->eq("perl", "PERL"); # true
1834 $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1836 $Collator->change(level => 4)->eq("perl", "PERL"); # false
1838 =item C<$version = $Collator-E<gt>version()>
1840 Returns the version number (a string) of the Unicode Standard
1841 which the C<table> file used by the collator object is based on.
1842 If the table does not include a version line (starting with C<@version>),
1843 returns C<"unknown">.
1845 =item C<UCA_Version()>
1847 Returns the tracking version number of UTS #10 this module consults.
1849 =item C<Base_Unicode_Version()>
1851 Returns the version number of UTS #10 this module consults.
1857 No method will be exported.
1861 Though this module can be used without any C<table> file,
1862 to use this module easily, it is recommended to install a table file
1863 in the UCA format, by copying it under the directory
1864 <a place in @INC>/Unicode/Collate.
1866 The most preferable one is "The Default Unicode Collation Element Table"
1867 (aka DUCET), available from the Unicode Consortium's website:
1869 http://www.unicode.org/Public/UCA/
1871 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1873 If DUCET is not installed, it is recommended to copy the file
1874 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1875 to <a place in @INC>/Unicode/Collate/allkeys.txt
1884 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1885 module (see L<Unicode::Normalize>).
1887 If you need not it (say, in the case when you need not
1888 handle any combining characters),
1889 assign C<normalization =E<gt> undef> explicitly.
1891 -- see 6.5 Avoiding Normalization, UTS #10.
1893 =item Conformance Test
1895 The Conformance Test for the UCA is available
1896 under L<http://www.unicode.org/Public/UCA/>.
1898 For F<CollationTest_SHIFTED.txt>,
1899 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1900 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1901 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1903 B<Unicode::Normalize is required to try The Conformance Test.>
1907 =head1 AUTHOR, COPYRIGHT AND LICENSE
1909 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1910 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
1911 SADAHIRO Tomoyuki. Japan. All rights reserved.
1913 This module is free software; you can redistribute it and/or
1914 modify it under the same terms as Perl itself.
1916 The file Unicode/Collate/allkeys.txt was copied verbatim
1917 from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
1918 This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
1919 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1925 =item Unicode Collation Algorithm - UTS #10
1927 L<http://www.unicode.org/reports/tr10/>
1929 =item The Default Unicode Collation Element Table (DUCET)
1931 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1933 =item The conformance test for the UCA
1935 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1937 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1939 =item Hangul Syllable Type
1941 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1943 =item Unicode Normalization Forms - UAX #15
1945 L<http://www.unicode.org/reports/tr15/>