cpan/Unicode-Collate/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 no warnings 'utf8';
  16
  17 our $VERSION = '0.56';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 my @Path = qw(Unicode Collate);
  21 my $KeyFile = "allkeys.txt";
  22
  23 # Perl's boolean
  24 use constant TRUE  => 1;
  25 use constant FALSE => "";
  26 use constant NOMATCHPOS => -1;
  27
  28 # A coderef to get combining class imported from Unicode::Normalize
  29 # (i.e. \&Unicode::Normalize::getCombinClass).
  30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  31 my $CVgetCombinClass;
  32
  33 # Supported Levels
  34 use constant MinLevel => 1;
  35 use constant MaxLevel => 4;
  36
  37 # Minimum weights at level 2 and 3, respectively
  38 use constant Min2Wt => 0x20;
  39 use constant Min3Wt => 0x02;
  40
  41 # Shifted weight at 4th level
  42 use constant Shift4Wt => 0xFFFF;
  43
  44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  45 # PROBLEM: The Default Unicode Collation Element Table
  46 # has weights over 0xFFFF at the 4th level.
  47 # The tie-breaking in the variable weights
  48 # other than "shift" (as well as "shift-trimmed") is unreliable.
  49 use constant VCE_TEMPLATE => 'Cn4';
  50
  51 # A sort key: 16-bit weights
  52 # See also the PROBLEM on VCE_TEMPLATE above.
  53 use constant KEY_TEMPLATE => 'n*';
  54
  55 # Level separator in a sort key:
  56 # i.e. pack(KEY_TEMPLATE, 0)
  57 use constant LEVEL_SEP => "\0\0";
  58
  59 # As Unicode code point separator for hash keys.
  60 # A joined code point string (denoted by JCPS below)
  61 # like "65;768" is used for internal processing
  62 # instead of Perl's Unicode string like "\x41\x{300}",
  63 # as the native code point is different from the Unicode code point
  64 # on EBCDIC platform.
  65 # This character must not be included in any stringified
  66 # representation of an integer.
  67 use constant CODE_SEP => ';';
  68
  69 # boolean values of variable weights
  70 use constant NON_VAR => 0; # Non-Variable character
  71 use constant VAR     => 1; # Variable character
  72
  73 # specific code points
  74 use constant Hangul_SBase  => 0xAC00;
  75 use constant Hangul_SIni   => 0xAC00;
  76 use constant Hangul_SFin   => 0xD7A3;
  77 use constant Hangul_NCount => 588;
  78 use constant Hangul_TCount => 28;
  79 use constant Hangul_LBase  => 0x1100;
  80 use constant Hangul_LIni   => 0x1100;
  81 use constant Hangul_LFin   => 0x1159;
  82 use constant Hangul_LFill  => 0x115F;
  83 use constant Hangul_LEnd   => 0x115F; # Unicode 5.2.0
  84 use constant Hangul_VBase  => 0x1161;
  85 use constant Hangul_VIni   => 0x1160; # from Vowel Filler
  86 use constant Hangul_VFin   => 0x11A2;
  87 use constant Hangul_VEnd   => 0x11A7; # Unicode 5.2.0
  88 use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
  89 use constant Hangul_TIni   => 0x11A8;
  90 use constant Hangul_TFin   => 0x11F9;
  91 use constant Hangul_TEnd   => 0x11FF; # Unicode 5.2.0
  92 use constant HangulL2Ini   => 0xA960; # Unicode 5.2.0
  93 use constant HangulL2Fin   => 0xA97C; # Unicode 5.2.0
  94 use constant HangulV2Ini   => 0xD7B0; # Unicode 5.2.0
  95 use constant HangulV2Fin   => 0xD7C6; # Unicode 5.2.0
  96 use constant HangulT2Ini   => 0xD7CB; # Unicode 5.2.0
  97 use constant HangulT2Fin   => 0xD7FB; # Unicode 5.2.0
  98
  99 use constant CJK_UidIni    => 0x4E00;
 100 use constant CJK_UidFin    => 0x9FA5;
 101 use constant CJK_UidF41    => 0x9FBB;
 102 use constant CJK_UidF51    => 0x9FC3;
 103 use constant CJK_UidF52    => 0x9FCB;
 104 use constant CJK_ExtAIni   => 0x3400;
 105 use constant CJK_ExtAFin   => 0x4DB5;
 106 use constant CJK_ExtBIni   => 0x20000;
 107 use constant CJK_ExtBFin   => 0x2A6D6;
 108 use constant CJK_ExtCIni   => 0x2A700; # Unicode 5.2.0
 109 use constant CJK_ExtCFin   => 0x2B734; # Unicode 5.2.0
 110
 111 # Logical_Order_Exception in PropList.txt
 112 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 113
 114 sub UCA_Version { "20" }
 115
 116 sub Base_Unicode_Version { "5.2.0" }
 117
 118 ######
 119
 120 sub pack_U {
 121     return pack('U*', @_);
 122 }
 123
 124 sub unpack_U {
 125     return unpack('U*', shift(@_).pack('U*'));
 126 }
 127
 128 ######
 129
 130 my (%VariableOK);
 131 @VariableOK{ qw/
 132     blanked  non-ignorable  shifted  shift-trimmed
 133   / } = (); # keys lowercased
 134
 135 our @ChangeOK = qw/
 136     alternate backwards level normalization rearrange
 137     katakana_before_hiragana upper_before_lower
 138     overrideHangul overrideCJK preprocess UCA_Version
 139     hangul_terminator variable
 140   /;
 141
 142 our @ChangeNG = qw/
 143     entry mapping table maxlength
 144     ignoreChar ignoreName undefChar undefName variableTable
 145     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 146     derivCode normCode rearrangeHash backwardsFlag
 147   /;
 148 # The hash key 'ignored' is deleted at v 0.21.
 149 # The hash key 'isShift' is deleted at v 0.23.
 150 # The hash key 'combining' is deleted at v 0.24.
 151 # The hash key 'entries' is deleted at v 0.30.
 152 # The hash key 'L3_ignorable' is deleted at v 0.40.
 153
 154 sub version {
 155     my $self = shift;
 156     return $self->{versionTable} || 'unknown';
 157 }
 158
 159 my (%ChangeOK, %ChangeNG);
 160 @ChangeOK{ @ChangeOK } = ();
 161 @ChangeNG{ @ChangeNG } = ();
 162
 163 sub change {
 164     my $self = shift;
 165     my %hash = @_;
 166     my %old;
 167     if (exists $hash{variable} && exists $hash{alternate}) {
 168         delete $hash{alternate};
 169     }
 170     elsif (!exists $hash{variable} && exists $hash{alternate}) {
 171         $hash{variable} = $hash{alternate};
 172     }
 173     foreach my $k (keys %hash) {
 174         if (exists $ChangeOK{$k}) {
 175             $old{$k} = $self->{$k};
 176             $self->{$k} = $hash{$k};
 177         }
 178         elsif (exists $ChangeNG{$k}) {
 179             croak "change of $k via change() is not allowed!";
 180         }
 181         # else => ignored
 182     }
 183     $self->checkCollator();
 184     return wantarray ? %old : $self;
 185 }
 186
 187 sub _checkLevel {
 188     my $level = shift;
 189     my $key   = shift; # 'level' or 'backwards'
 190     MinLevel <= $level or croak sprintf
 191         "Illegal level %d (in value for key '%s') lower than %d.",
 192             $level, $key, MinLevel;
 193     $level <= MaxLevel or croak sprintf
 194         "Unsupported level %d (in value for key '%s') higher than %d.",
 195             $level, $key, MaxLevel;
 196 }
 197
 198 my %DerivCode = (
 199     8 => \&_derivCE_8,
 200     9 => \&_derivCE_9,
 201    11 => \&_derivCE_9, # 11 == 9
 202    14 => \&_derivCE_14,
 203    16 => \&_derivCE_14, # 16 == 14
 204    18 => \&_derivCE_18,
 205    20 => \&_derivCE_20,
 206 );
 207
 208 sub checkCollator {
 209     my $self = shift;
 210     _checkLevel($self->{level}, "level");
 211
 212     $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 213         or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 214
 215     $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 216                                 $self->{alternateTable} || 'shifted';
 217     $self->{variable} = $self->{alternate} = lc($self->{variable});
 218     exists $VariableOK{ $self->{variable} }
 219         or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
 220
 221     if (! defined $self->{backwards}) {
 222         $self->{backwardsFlag} = 0;
 223     }
 224     elsif (! ref $self->{backwards}) {
 225         _checkLevel($self->{backwards}, "backwards");
 226         $self->{backwardsFlag} = 1 << $self->{backwards};
 227     }
 228     else {
 229         my %level;
 230         $self->{backwardsFlag} = 0;
 231         for my $b (@{ $self->{backwards} }) {
 232             _checkLevel($b, "backwards");
 233             $level{$b} = 1;
 234         }
 235         for my $v (sort keys %level) {
 236             $self->{backwardsFlag} += 1 << $v;
 237         }
 238     }
 239
 240     defined $self->{rearrange} or $self->{rearrange} = [];
 241     ref $self->{rearrange}
 242         or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 243
 244     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 245     $self->{rearrangeHash} = undef;
 246
 247     if (@{ $self->{rearrange} }) {
 248         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 249     }
 250
 251     $self->{normCode} = undef;
 252
 253     if (defined $self->{normalization}) {
 254         eval { require Unicode::Normalize };
 255         $@ and croak "Unicode::Normalize is required to normalize strings";
 256
 257         $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 258
 259         if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 260             $self->{normCode} = \&Unicode::Normalize::NFD;
 261         }
 262         elsif ($self->{normalization} ne 'prenormalized') {
 263             my $norm = $self->{normalization};
 264             $self->{normCode} = sub {
 265                 Unicode::Normalize::normalize($norm, shift);
 266             };
 267             eval { $self->{normCode}->("") }; # try
 268             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 269         }
 270     }
 271     return;
 272 }
 273
 274 sub new
 275 {
 276     my $class = shift;
 277     my $self = bless { @_ }, $class;
 278
 279     # If undef is passed explicitly, no file is read.
 280     $self->{table} = $KeyFile if ! exists $self->{table};
 281     $self->read_table() if defined $self->{table};
 282
 283     if ($self->{entry}) {
 284         while ($self->{entry} =~ /([^\n]+)/g) {
 285             $self->parseEntry($1);
 286         }
 287     }
 288
 289     $self->{level} ||= MaxLevel;
 290     $self->{UCA_Version} ||= UCA_Version();
 291
 292     $self->{overrideHangul} = FALSE
 293         if ! exists $self->{overrideHangul};
 294     $self->{overrideCJK} = FALSE
 295         if ! exists $self->{overrideCJK};
 296     $self->{normalization} = 'NFD'
 297         if ! exists $self->{normalization};
 298     $self->{rearrange} = $self->{rearrangeTable} ||
 299         ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
 300         if ! exists $self->{rearrange};
 301     $self->{backwards} = $self->{backwardsTable}
 302         if ! exists $self->{backwards};
 303
 304     $self->checkCollator();
 305
 306     return $self;
 307 }
 308
 309 sub parseAtmark {
 310     my $self = shift;
 311     my $line = shift; # after s/^\s*\@//
 312
 313     if ($line =~ /^version\s*(\S*)/) {
 314         $self->{versionTable} ||= $1;
 315     }
 316     elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
 317         $self->{variableTable} ||= $1;
 318     }
 319     elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
 320         $self->{alternateTable} ||= $1;
 321     }
 322     elsif ($line =~ /^backwards\s+(\S*)/) {
 323         push @{ $self->{backwardsTable} }, $1;
 324     }
 325     elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
 326         push @{ $self->{forwardsTable} }, $1;
 327     }
 328     elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
 329         push @{ $self->{rearrangeTable} }, _getHexArray($1);
 330     }
 331 }
 332
 333 sub read_table {
 334     my $self = shift;
 335
 336     my($f, $fh);
 337     foreach my $d (@INC) {
 338         $f = File::Spec->catfile($d, @Path, $self->{table});
 339         last if open($fh, $f);
 340         $f = undef;
 341     }
 342     if (!defined $f) {
 343         $f = File::Spec->catfile(@Path, $self->{table});
 344         croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
 345     }
 346
 347     while (my $line = <$fh>) {
 348         next if $line =~ /^\s*#/;
 349
 350         if ($line =~ s/^\s*\@//) {
 351             $self->parseAtmark($line);
 352         } else {
 353             $self->parseEntry($line);
 354         }
 355     }
 356     close $fh;
 357 }
 358
 359
 360 ##
 361 ## get $line, parse it, and write an entry in $self
 362 ##
 363 sub parseEntry
 364 {
 365     my $self = shift;
 366     my $line = shift;
 367     my($name, $entry, @uv, @key);
 368
 369     return if $line !~ /^\s*[0-9A-Fa-f]/;
 370
 371     # removes comment and gets name
 372     $name = $1
 373         if $line =~ s/[#%]\s*(.*)//;
 374     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 375
 376     # gets element
 377     my($e, $k) = split /;/, $line;
 378     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 379         if ! $k;
 380
 381     @uv = _getHexArray($e);
 382     return if !@uv;
 383
 384     $entry = join(CODE_SEP, @uv); # in JCPS
 385
 386     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 387         my $ele = pack_U(@uv);
 388
 389         # regarded as if it were not entried in the table
 390         return
 391             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 392
 393         # replaced as completely ignorable
 394         $k = '[.0000.0000.0000.0000]'
 395             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 396     }
 397
 398     # replaced as completely ignorable
 399     $k = '[.0000.0000.0000.0000]'
 400         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 401
 402     my $is_L3_ignorable = TRUE;
 403
 404     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 405         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 406         my @wt = _getHexArray($arr);
 407         push @key, pack(VCE_TEMPLATE, $var, @wt);
 408         $is_L3_ignorable = FALSE
 409             if $wt[0] || $wt[1] || $wt[2];
 410         # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
 411         # is completely ignorable.
 412         # For expansion, an entry $is_L3_ignorable
 413         # if and only if "all" CEs are [.0000.0000.0000].
 414     }
 415
 416     $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
 417
 418     if (@uv > 1) {
 419         (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 420             and $self->{maxlength}{$uv[0]} = @uv;
 421     }
 422 }
 423
 424
 425 ##
 426 ## VCE = _varCE(variable term, VCE)
 427 ##
 428 sub _varCE
 429 {
 430     my $vbl = shift;
 431     my $vce = shift;
 432     if ($vbl eq 'non-ignorable') {
 433         return $vce;
 434     }
 435     my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
 436
 437     if ($var) {
 438         return pack(VCE_TEMPLATE, $var, 0, 0, 0,
 439                 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
 440     }
 441     elsif ($vbl eq 'blanked') {
 442         return $vce;
 443     }
 444     else {
 445         return pack(VCE_TEMPLATE, $var, @wt[0..2],
 446             $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
 447     }
 448 }
 449
 450 sub viewSortKey
 451 {
 452     my $self = shift;
 453     $self->visualizeSortKey($self->getSortKey(@_));
 454 }
 455
 456 sub visualizeSortKey
 457 {
 458     my $self = shift;
 459     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 460
 461     if ($self->{UCA_Version} <= 8) {
 462         $view =~ s/ ?0000 ?/|/g;
 463     } else {
 464         $view =~ s/\b0000\b/|/g;
 465     }
 466     return "[$view]";
 467 }
 468
 469
 470 ##
 471 ## arrayref of JCPS   = splitEnt(string to be collated)
 472 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 473 ##
 474 sub splitEnt
 475 {
 476     my $self = shift;
 477     my $wLen = $_[1];
 478
 479     my $code = $self->{preprocess};
 480     my $norm = $self->{normCode};
 481     my $map  = $self->{mapping};
 482     my $max  = $self->{maxlength};
 483     my $reH  = $self->{rearrangeHash};
 484     my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
 485
 486     my ($str, @buf);
 487
 488     if ($wLen) {
 489         $code and croak "Preprocess breaks character positions. "
 490                         . "Don't use with index(), match(), etc.";
 491         $norm and croak "Normalization breaks character positions. "
 492                         . "Don't use with index(), match(), etc.";
 493         $str = $_[0];
 494     }
 495     else {
 496         $str = $_[0];
 497         $str = &$code($str) if ref $code;
 498         $str = &$norm($str) if ref $norm;
 499     }
 500
 501     # get array of Unicode code point of string.
 502     my @src = unpack_U($str);
 503
 504     # rearrangement:
 505     # Character positions are not kept if rearranged,
 506     # then neglected if $wLen is true.
 507     if ($reH && ! $wLen) {
 508         for (my $i = 0; $i < @src; $i++) {
 509             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 510                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 511                 $i++;
 512             }
 513         }
 514     }
 515
 516     # remove a code point marked as a completely ignorable.
 517     for (my $i = 0; $i < @src; $i++) {
 518         $src[$i] = undef
 519             if _isIllegal($src[$i]) || ($ver9 &&
 520                 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
 521     }
 522
 523     for (my $i = 0; $i < @src; $i++) {
 524         my $jcps = $src[$i];
 525
 526         # skip removed code point
 527         if (! defined $jcps) {
 528             if ($wLen && @buf) {
 529                 $buf[-1][2] = $i + 1;
 530             }
 531             next;
 532         }
 533
 534         my $i_orig = $i;
 535
 536         # find contraction
 537         if ($max->{$jcps}) {
 538             my $temp_jcps = $jcps;
 539             my $jcpsLen = 1;
 540             my $maxLen = $max->{$jcps};
 541
 542             for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 543                 next if ! defined $src[$p];
 544                 $temp_jcps .= CODE_SEP . $src[$p];
 545                 $jcpsLen++;
 546                 if ($map->{$temp_jcps}) {
 547                     $jcps = $temp_jcps;
 548                     $i = $p;
 549                 }
 550             }
 551
 552         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 553         # This process requires Unicode::Normalize.
 554         # If "normalization" is undef, here should be skipped *always*
 555         # (in spite of bool value of $CVgetCombinClass),
 556         # since canonical ordering cannot be expected.
 557         # Blocked combining character should not be contracted.
 558
 559             if ($self->{normalization})
 560             # $self->{normCode} is false in the case of "prenormalized".
 561             {
 562                 my $preCC = 0;
 563                 my $curCC = 0;
 564
 565                 for (my $p = $i + 1; $p < @src; $p++) {
 566                     next if ! defined $src[$p];
 567                     $curCC = $CVgetCombinClass->($src[$p]);
 568                     last unless $curCC;
 569                     my $tail = CODE_SEP . $src[$p];
 570                     if ($preCC != $curCC && $map->{$jcps.$tail}) {
 571                         $jcps .= $tail;
 572                         $src[$p] = undef;
 573                     } else {
 574                         $preCC = $curCC;
 575                     }
 576                 }
 577             }
 578         }
 579
 580         # skip completely ignorable
 581         if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
 582             if ($wLen && @buf) {
 583                 $buf[-1][2] = $i + 1;
 584             }
 585             next;
 586         }
 587
 588         push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 589     }
 590     return \@buf;
 591 }
 592
 593
 594 ##
 595 ## list of VCE = getWt(JCPS)
 596 ##
 597 sub getWt
 598 {
 599     my $self = shift;
 600     my $u    = shift;
 601     my $vbl  = $self->{variable};
 602     my $map  = $self->{mapping};
 603     my $der  = $self->{derivCode};
 604
 605     return if !defined $u;
 606     return map(_varCE($vbl, $_), @{ $map->{$u} })
 607         if $map->{$u};
 608
 609     # JCPS must not be a contraction, then it's a code point.
 610     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 611         my $hang = $self->{overrideHangul};
 612         my @hangulCE;
 613         if ($hang) {
 614             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 615         }
 616         elsif (!defined $hang) {
 617             @hangulCE = $der->($u);
 618         }
 619         else {
 620             my $max  = $self->{maxlength};
 621             my @decH = _decompHangul($u);
 622
 623             if (@decH == 2) {
 624                 my $contract = join(CODE_SEP, @decH);
 625                 @decH = ($contract) if $map->{$contract};
 626             } else { # must be <@decH == 3>
 627                 if ($max->{$decH[0]}) {
 628                     my $contract = join(CODE_SEP, @decH);
 629                     if ($map->{$contract}) {
 630                         @decH = ($contract);
 631                     } else {
 632                         $contract = join(CODE_SEP, @decH[0,1]);
 633                         $map->{$contract} and @decH = ($contract, $decH[2]);
 634                     }
 635                     # even if V's ignorable, LT contraction is not supported.
 636                     # If such a situatution were required, NFD should be used.
 637                 }
 638                 if (@decH == 3 && $max->{$decH[1]}) {
 639                     my $contract = join(CODE_SEP, @decH[1,2]);
 640                     $map->{$contract} and @decH = ($decH[0], $contract);
 641                 }
 642             }
 643
 644             @hangulCE = map({
 645                     $map->{$_} ? @{ $map->{$_} } : $der->($_);
 646                 } @decH);
 647         }
 648         return map _varCE($vbl, $_), @hangulCE;
 649     }
 650     elsif (_isUIdeo($u, $self->{UCA_Version})) {
 651         my $cjk  = $self->{overrideCJK};
 652         return map _varCE($vbl, $_),
 653             $cjk
 654                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 655                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 656                     ? _uideoCE_8($u)
 657                     : $der->($u);
 658     }
 659     else {
 660         return map _varCE($vbl, $_), $der->($u);
 661     }
 662 }
 663
 664
 665 ##
 666 ## string sortkey = getSortKey(string arg)
 667 ##
 668 sub getSortKey
 669 {
 670     my $self = shift;
 671     my $lev  = $self->{level};
 672     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 673     my $vers = $self->{UCA_Version};
 674     my $v2i  = $vers >= 9 && $self->{variable} ne 'non-ignorable';
 675
 676     my @buf; # weight arrays
 677     if ($self->{hangul_terminator}) {
 678         my $preHST = '';
 679         foreach my $jcps (@$rEnt) {
 680             # weird things like VL, TL-contraction are not considered!
 681             my $curHST = '';
 682             foreach my $u (split /;/, $jcps) {
 683                 $curHST .= getHST($u, $vers);
 684             }
 685             if ($preHST && !$curHST || # hangul before non-hangul
 686                 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 687                 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 688                 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 689
 690                 push @buf, $self->getWtHangulTerm();
 691             }
 692             $preHST = $curHST;
 693
 694             push @buf, $self->getWt($jcps);
 695         }
 696         $preHST # end at hangul
 697             and push @buf, $self->getWtHangulTerm();
 698     }
 699     else {
 700         foreach my $jcps (@$rEnt) {
 701             push @buf, $self->getWt($jcps);
 702         }
 703     }
 704
 705     # make sort key
 706     my @ret = ([],[],[],[]);
 707     my $last_is_variable;
 708
 709     foreach my $vwt (@buf) {
 710         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 711
 712         # "Ignorable (L1, L2) after Variable" since track. v. 9
 713         if ($v2i) {
 714             if ($var) {
 715                 $last_is_variable = TRUE;
 716             }
 717             elsif (!$wt[0]) { # ignorable
 718                 next if $last_is_variable;
 719             }
 720             else {
 721                 $last_is_variable = FALSE;
 722             }
 723         }
 724         foreach my $v (0..$lev-1) {
 725             0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
 726         }
 727     }
 728
 729     # modification of tertiary weights
 730     if ($self->{upper_before_lower}) {
 731         foreach my $w (@{ $ret[2] }) {
 732             if    (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
 733             elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
 734             elsif ($w == 0x1C)             { $w += 1 } # square upper
 735             elsif ($w == 0x1D)             { $w -= 1 } # square lower
 736         }
 737     }
 738     if ($self->{katakana_before_hiragana}) {
 739         foreach my $w (@{ $ret[2] }) {
 740             if    (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
 741             elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
 742         }
 743     }
 744
 745     if ($self->{backwardsFlag}) {
 746         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 747             if ($self->{backwardsFlag} & (1 << $v)) {
 748                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 749             }
 750         }
 751     }
 752
 753     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 754 }
 755
 756
 757 ##
 758 ## int compare = cmp(string a, string b)
 759 ##
 760 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 761 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 762 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 763 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 764 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 765 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 766 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 767
 768 ##
 769 ## list[strings] sorted = sort(list[strings] arg)
 770 ##
 771 sub sort {
 772     my $obj = shift;
 773     return
 774         map { $_->[1] }
 775             sort{ $a->[0] cmp $b->[0] }
 776                 map [ $obj->getSortKey($_), $_ ], @_;
 777 }
 778
 779
 780 sub _derivCE_20 {
 781     my $u = shift;
 782     my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF52) ? 0xFB40 : # CJK
 783                (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 784                 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
 785                 CJK_ExtCIni <= $u && $u <= CJK_ExtCFin) ? 0xFB80  # CJK ext.
 786                                                         : 0xFBC0; # others
 787     my $aaaa = $base + ($u >> 15);
 788     my $bbbb = ($u & 0x7FFF) | 0x8000;
 789     return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 790            pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 791 }
 792
 793 sub _derivCE_18 {
 794     my $u = shift;
 795     my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF51) ? 0xFB40 : # CJK
 796                (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 797                 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80  # CJK ext.
 798                                                         : 0xFBC0; # others
 799     my $aaaa = $base + ($u >> 15);
 800     my $bbbb = ($u & 0x7FFF) | 0x8000;
 801     return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 802            pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 803 }
 804
 805 sub _derivCE_14 {
 806     my $u = shift;
 807     my $base = (CJK_UidIni  <= $u && $u <= CJK_UidF41) ? 0xFB40 : # CJK
 808                (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 809                 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80  # CJK ext.
 810                                                         : 0xFBC0; # others
 811     my $aaaa = $base + ($u >> 15);
 812     my $bbbb = ($u & 0x7FFF) | 0x8000;
 813     return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 814            pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 815 }
 816
 817 sub _derivCE_9 {
 818     my $u = shift;
 819     my $base = (CJK_UidIni  <= $u && $u <= CJK_UidFin) ? 0xFB40 : # CJK
 820                (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 821                 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) ? 0xFB80  # CJK ext.
 822                                                         : 0xFBC0; # others
 823     my $aaaa = $base + ($u >> 15);
 824     my $bbbb = ($u & 0x7FFF) | 0x8000;
 825     return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 826            pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 827 }
 828
 829 sub _derivCE_8 {
 830     my $code = shift;
 831     my $aaaa =  0xFF80 + ($code >> 15);
 832     my $bbbb = ($code & 0x7FFF) | 0x8000;
 833     return
 834         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 835         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 836 }
 837
 838 sub _uideoCE_8 {
 839     my $u = shift;
 840     return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
 841 }
 842
 843 sub _isUIdeo {
 844     my ($u, $uca_vers) = @_;
 845     return((CJK_UidIni <= $u && (
 846             $uca_vers >= 20 ? ($u <= CJK_UidF52) :
 847             $uca_vers >= 18 ? ($u <= CJK_UidF51) :
 848             $uca_vers >= 14 ? ($u <= CJK_UidF41) :
 849                               ($u <= CJK_UidFin)))
 850                 ||
 851         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
 852                 ||
 853         (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 854                 ||
 855         ($uca_vers >= 20 &&
 856          CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
 857     );
 858 }
 859
 860
 861 sub getWtHangulTerm {
 862     my $self = shift;
 863     return _varCE($self->{variable},
 864         pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
 865 }
 866
 867
 868 ##
 869 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 870 ##
 871 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 872
 873 #
 874 # $code *must* be in Hangul syllable.
 875 # Check it before you enter here.
 876 #
 877 sub _decompHangul {
 878     my $code = shift;
 879     my $si = $code - Hangul_SBase;
 880     my $li = int( $si / Hangul_NCount);
 881     my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
 882     my $ti =      $si % Hangul_TCount;
 883     return (
 884         Hangul_LBase + $li,
 885         Hangul_VBase + $vi,
 886         $ti ? (Hangul_TBase + $ti) : (),
 887     );
 888 }
 889
 890 sub _isIllegal {
 891     my $code = shift;
 892     return ! defined $code                      # removed
 893         || ($code < 0 || 0x10FFFF < $code)      # out of range
 894         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 895         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 896         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 897     ;
 898 }
 899
 900 # Hangul Syllable Type
 901 sub getHST {
 902     my $u = shift;
 903     my $vers = shift || 0;
 904
 905     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 906         return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
 907     }
 908
 909     if ($vers < 20) {
 910         return Hangul_LIni <= $u && $u <= Hangul_LFin ||
 911                                     $u == Hangul_LFill ? "L" :
 912                Hangul_VIni <= $u && $u <= Hangul_VFin  ? "V" :
 913                Hangul_TIni <= $u && $u <= Hangul_TFin  ? "T" : "";
 914     } else {
 915         return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
 916                HangulL2Ini <= $u && $u <= HangulL2Fin  ? "L" :
 917                Hangul_VIni <= $u && $u <= Hangul_VEnd ||
 918                HangulV2Ini <= $u && $u <= HangulV2Fin  ? "V" :
 919                Hangul_TIni <= $u && $u <= Hangul_TEnd ||
 920                HangulT2Ini <= $u && $u <= HangulT2Fin  ? "T" : "";
 921     }
 922 }
 923
 924
 925 ##
 926 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 927 ##
 928 sub _nonIgnorAtLevel($$)
 929 {
 930     my $wt = shift;
 931     return if ! defined $wt;
 932     my $lv = shift;
 933     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 934 }
 935
 936 ##
 937 ## bool _eqArray(
 938 ##    arrayref of arrayref[weights] source,
 939 ##    arrayref of arrayref[weights] substr,
 940 ##    int level)
 941 ## * comparison of graphemes vs graphemes.
 942 ##   @$source >= @$substr must be true (check it before call this);
 943 ##
 944 sub _eqArray($$$)
 945 {
 946     my $source = shift;
 947     my $substr = shift;
 948     my $lev = shift;
 949
 950     for my $g (0..@$substr-1){
 951         # Do the $g'th graphemes have the same number of AV weigths?
 952         return if @{ $source->[$g] } != @{ $substr->[$g] };
 953
 954         for my $w (0..@{ $substr->[$g] }-1) {
 955             for my $v (0..$lev-1) {
 956                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 957             }
 958         }
 959     }
 960     return 1;
 961 }
 962
 963 ##
 964 ## (int position, int length)
 965 ## int position = index(string, substring, position, [undoc'ed grobal])
 966 ##
 967 ## With "grobal" (only for the list context),
 968 ##  returns list of arrayref[position, length].
 969 ##
 970 sub index
 971 {
 972     my $self = shift;
 973     my $str  = shift;
 974     my $len  = length($str);
 975     my $subE = $self->splitEnt(shift);
 976     my $pos  = @_ ? shift : 0;
 977        $pos  = 0 if $pos < 0;
 978     my $grob = shift;
 979
 980     my $lev  = $self->{level};
 981     my $v2i  = $self->{UCA_Version} >= 9 &&
 982                 $self->{variable} ne 'non-ignorable';
 983
 984     if (! @$subE) {
 985         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 986         return $grob
 987             ? map([$_, 0], $temp..$len)
 988             : wantarray ? ($temp,0) : $temp;
 989     }
 990     $len < $pos
 991         and return wantarray ? () : NOMATCHPOS;
 992     my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 993     @$strE
 994         or return wantarray ? () : NOMATCHPOS;
 995
 996     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 997
 998     my $last_is_variable;
 999     for my $vwt (map $self->getWt($_), @$subE) {
1000         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1001         my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
1002
1003         # "Ignorable (L1, L2) after Variable" since track. v. 9
1004         if ($v2i) {
1005             if ($var) {
1006                 $last_is_variable = TRUE;
1007             }
1008             elsif (!$wt[0]) { # ignorable
1009                 $to_be_pushed = FALSE if $last_is_variable;
1010             }
1011             else {
1012                 $last_is_variable = FALSE;
1013             }
1014         }
1015
1016         if (@subWt && !$var && !$wt[0]) {
1017             push @{ $subWt[-1] }, \@wt if $to_be_pushed;
1018         } else {
1019             push @subWt, [ \@wt ];
1020         }
1021     }
1022
1023     my $count = 0;
1024     my $end = @$strE - 1;
1025
1026     $last_is_variable = FALSE; # reuse
1027     for (my $i = 0; $i <= $end; ) { # no $i++
1028         my $found_base = 0;
1029
1030         # fetch a grapheme
1031         while ($i <= $end && $found_base == 0) {
1032             for my $vwt ($self->getWt($strE->[$i][0])) {
1033                 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
1034                 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
1035
1036                 # "Ignorable (L1, L2) after Variable" since track. v. 9
1037                 if ($v2i) {
1038                     if ($var) {
1039                         $last_is_variable = TRUE;
1040                     }
1041                     elsif (!$wt[0]) { # ignorable
1042                         $to_be_pushed = FALSE if $last_is_variable;
1043                     }
1044                     else {
1045                         $last_is_variable = FALSE;
1046                     }
1047                 }
1048
1049                 if (@strWt && !$var && !$wt[0]) {
1050                     push @{ $strWt[-1] }, \@wt if $to_be_pushed;
1051                     $finPos[-1] = $strE->[$i][2];
1052                 } elsif ($to_be_pushed) {
1053                     push @strWt, [ \@wt ];
1054                     push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1055                     $finPos[-1] = NOMATCHPOS if $found_base;
1056                     push @finPos, $strE->[$i][2];
1057                     $found_base++;
1058                 }
1059                 # else ===> no-op
1060             }
1061             $i++;
1062         }
1063
1064         # try to match
1065         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1066             if ($iniPos[0] != NOMATCHPOS &&
1067                     $finPos[$#subWt] != NOMATCHPOS &&
1068                         _eqArray(\@strWt, \@subWt, $lev)) {
1069                 my $temp = $iniPos[0] + $pos;
1070
1071                 if ($grob) {
1072                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1073                     splice @strWt,  0, $#subWt;
1074                     splice @iniPos, 0, $#subWt;
1075                     splice @finPos, 0, $#subWt;
1076                 }
1077                 else {
1078                     return wantarray
1079                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
1080                         :  $temp;
1081                 }
1082             }
1083             shift @strWt;
1084             shift @iniPos;
1085             shift @finPos;
1086         }
1087     }
1088
1089     return $grob
1090         ? @g_ret
1091         : wantarray ? () : NOMATCHPOS;
1092 }
1093
1094 ##
1095 ## scalarref to matching part = match(string, substring)
1096 ##
1097 sub match
1098 {
1099     my $self = shift;
1100     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1101         my $temp = substr($_[0], $pos, $len);
1102         return wantarray ? $temp : \$temp;
1103         # An lvalue ref \substr should be avoided,
1104         # since its value is affected by modification of its referent.
1105     }
1106     else {
1107         return;
1108     }
1109 }
1110
1111 ##
1112 ## arrayref matching parts = gmatch(string, substring)
1113 ##
1114 sub gmatch
1115 {
1116     my $self = shift;
1117     my $str  = shift;
1118     my $sub  = shift;
1119     return map substr($str, $_->[0], $_->[1]),
1120                 $self->index($str, $sub, 0, 'g');
1121 }
1122
1123 ##
1124 ## bool subst'ed = subst(string, substring, replace)
1125 ##
1126 sub subst
1127 {
1128     my $self = shift;
1129     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1130
1131     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1132         if ($code) {
1133             my $mat = substr($_[0], $pos, $len);
1134             substr($_[0], $pos, $len, $code->($mat));
1135         } else {
1136             substr($_[0], $pos, $len, $_[2]);
1137         }
1138         return TRUE;
1139     }
1140     else {
1141         return FALSE;
1142     }
1143 }
1144
1145 ##
1146 ## int count = gsubst(string, substring, replace)
1147 ##
1148 sub gsubst
1149 {
1150     my $self = shift;
1151     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1152     my $cnt = 0;
1153
1154     # Replacement is carried out from the end, then use reverse.
1155     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1156         if ($code) {
1157             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1158             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1159         } else {
1160             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1161         }
1162         $cnt++;
1163     }
1164     return $cnt;
1165 }
1166
1167 1;
1168 __END__
1169
1170 =head1 NAME
1171
1172 Unicode::Collate - Unicode Collation Algorithm
1173
1174 =head1 SYNOPSIS
1175
1176   use Unicode::Collate;
1177
1178   #construct
1179   $Collator = Unicode::Collate->new(%tailoring);
1180
1181   #sort
1182   @sorted = $Collator->sort(@not_sorted);
1183
1184   #compare
1185   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1186
1187   # If %tailoring is false (i.e. empty),
1188   # $Collator should do the default collation.
1189
1190 =head1 DESCRIPTION
1191
1192 This module is an implementation of Unicode Technical Standard #10
1193 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1194
1195 =head2 Constructor and Tailoring
1196
1197 The C<new> method returns a collator object.
1198
1199    $Collator = Unicode::Collate->new(
1200       UCA_Version => $UCA_Version,
1201       alternate => $alternate, # deprecated: use of 'variable' is recommended.
1202       backwards => $levelNumber, # or \@levelNumbers
1203       entry => $element,
1204       hangul_terminator => $term_primary_weight,
1205       ignoreName => qr/$ignoreName/,
1206       ignoreChar => qr/$ignoreChar/,
1207       katakana_before_hiragana => $bool,
1208       level => $collationLevel,
1209       normalization  => $normalization_form,
1210       overrideCJK => \&overrideCJK,
1211       overrideHangul => \&overrideHangul,
1212       preprocess => \&preprocess,
1213       rearrange => \@charList,
1214       table => $filename,
1215       undefName => qr/$undefName/,
1216       undefChar => qr/$undefChar/,
1217       upper_before_lower => $bool,
1218       variable => $variable,
1219    );
1220
1221 =over 4
1222
1223 =item UCA_Version
1224
1225 If the tracking version number of UCA is given,
1226 behavior of that tracking version is emulated on collating.
1227 If omitted, the return value of C<UCA_Version()> is used.
1228 C<UCA_Version()> should return the latest tracking version supported.
1229
1230 The supported tracking version: 8, 9, 11, 14, 16, 18 or 20.
1231
1232      UCA       Unicode Standard         DUCET (@version)
1233      ---------------------------------------------------
1234       8              3.1                3.0.1 (3.0.1d9)
1235       9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1236      11              4.0                4.0.0 (4.0.0)
1237      14             4.1.0               4.1.0 (4.1.0)
1238      16              5.0                5.0.0 (5.0.0)
1239      18             5.1.0               5.1.0 (5.1.0)
1240      20             5.2.0               5.2.0 (5.2.0)
1241
1242 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1243
1244 =item alternate
1245
1246 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1247
1248 For backward compatibility, C<alternate> (old name) can be used
1249 as an alias for C<variable>.
1250
1251 =item backwards
1252
1253 -- see 3.1.2 French Accents, UTS #10.
1254
1255      backwards => $levelNumber or \@levelNumbers
1256
1257 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1258 If omitted, forwards at all the levels.
1259
1260 =item entry
1261
1262 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1263
1264 If the same character (or a sequence of characters) exists
1265 in the collation element table through C<table>,
1266 mapping to collation elements is overrided.
1267 If it does not exist, the mapping is defined additionally.
1268
1269     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1270 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1271 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1272 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1273 006C 006C ; [.0F4C.0020.0002.006C] # ll
1274 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1275 004C 004C ; [.0F4C.0020.0008.004C] # LL
1276 00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1277 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1278 00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1279 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1280 ENTRY
1281
1282     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1283 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1284 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1285 ENTRY
1286
1287 B<NOTE:> The code point in the UCA file format (before C<';'>)
1288 B<must> be a Unicode code point (defined as hexadecimal),
1289 but not a native code point.
1290 So C<0063> must always denote C<U+0063>,
1291 but not a character of C<"\x63">.
1292
1293 Weighting may vary depending on collation element table.
1294 So ensure the weights defined in C<entry> will be consistent with
1295 those in the collation element table loaded via C<table>.
1296
1297 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1298 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1299 (as a value between C<0E60> and C<0E6D>)
1300 makes ordering as C<C E<lt> CH E<lt> D>.
1301 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1302 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1303 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1304 and C<c-curl> (C<U+0255>) with C<0E69>.
1305 Then primary weight C<0E6A> for C<CH> makes C<CH>
1306 ordered between C<c-curl> and C<D>.
1307
1308 =item hangul_terminator
1309
1310 -- see 7.1.4 Trailing Weights, UTS #10.
1311
1312 If a true value is given (non-zero but should be positive),
1313 it will be added as a terminator primary weight to the end of
1314 every standard Hangul syllable. Secondary and any higher weights
1315 for terminator are set to zero.
1316 If the value is false or C<hangul_terminator> key does not exist,
1317 insertion of terminator weights will not be performed.
1318
1319 Boundaries of Hangul syllables are determined
1320 according to conjoining Jamo behavior in F<the Unicode Standard>
1321 and F<HangulSyllableType.txt>.
1322
1323 B<Implementation Note:>
1324 (1) For expansion mapping (Unicode character mapped
1325 to a sequence of collation elements), a terminator will not be added
1326 between collation elements, even if Hangul syllable boundary exists there.
1327 Addition of terminator is restricted to the next position
1328 to the last collation element.
1329
1330 (2) Non-conjoining Hangul letters
1331 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1332 automatically terminated with a terminator primary weight.
1333 These characters may need terminator included in a collation element
1334 table beforehand.
1335
1336 =item ignoreChar
1337
1338 =item ignoreName
1339
1340 -- see 3.2.2 Variable Weighting, UTS #10.
1341
1342 Makes the entry in the table completely ignorable;
1343 i.e. as if the weights were zero at all level.
1344
1345 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1346 will be ignored. Through C<ignoreName>, any character whose name
1347 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1348 will be ignored.
1349
1350 E.g. when 'a' and 'e' are ignorable,
1351 'element' is equal to 'lament' (or 'lmnt').
1352
1353 =item katakana_before_hiragana
1354
1355 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1356
1357 By default, hiragana is before katakana.
1358 If the parameter is made true, this is reversed.
1359
1360 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1361 distinctions must occur in level 3, and their weights at level 3 must be
1362 same as those mentioned in 7.3.1, UTS #10.
1363 If you define your collation elements which violate this requirement,
1364 this parameter does not work validly.
1365
1366 =item level
1367
1368 -- see 4.3 Form Sort Key, UTS #10.
1369
1370 Set the maximum level.
1371 Any higher levels than the specified one are ignored.
1372
1373   Level 1: alphabetic ordering
1374   Level 2: diacritic ordering
1375   Level 3: case ordering
1376   Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1377
1378   ex.level => 2,
1379
1380 If omitted, the maximum is the 4th.
1381
1382 =item normalization
1383
1384 -- see 4.1 Normalize, UTS #10.
1385
1386 If specified, strings are normalized before preparation of sort keys
1387 (the normalization is executed after preprocess).
1388
1389 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1390 as C<$normalization_form>.
1391 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1392 See C<Unicode::Normalize::normalize()> for detail.
1393 If omitted, C<'NFD'> is used.
1394
1395 C<normalization> is performed after C<preprocess> (if defined).
1396
1397 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1398 though they are not concerned with C<Unicode::Normalize::normalize()>.
1399
1400 If C<undef> (not a string C<"undef">) is passed explicitly
1401 as the value for this key,
1402 any normalization is not carried out (this may make tailoring easier
1403 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1404 only contiguous contractions are resolved;
1405 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1406 C<A-cedilla-ring> would be primary equal to C<A>.
1407 In this point,
1408 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1409 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1410
1411 In the case of C<(normalization =E<gt> "prenormalized")>,
1412 any normalization is not performed, but
1413 non-contiguous contractions with combining characters are performed.
1414 Therefore
1415 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1416 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1417 If source strings are finely prenormalized,
1418 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1419
1420 Except C<(normalization =E<gt> undef)>,
1421 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1422
1423 =item overrideCJK
1424
1425 -- see 7.1 Derived Collation Elements, UTS #10.
1426
1427 By default, CJK Unified Ideographs are ordered in Unicode codepoint
1428 order but C<CJK Unified Ideographs> are lesser than
1429 C<CJK Unified Ideographs Extension>.
1430
1431     CJK Unified Ideographs:
1432     U+4E00..U+9FA5 if UCA_Version is 8 to 11;
1433     U+4E00..U+9FBB if UCA_Version is 14 to 16;
1434     U+4E00..U+9FC3 if UCA_Version is 18;
1435     U+4E00..U+9FCB if UCA_Version> is 20.
1436
1437     CJK Unified Ideographs Extension:
1438     Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) if UCA_Version < 20;
1439     Ext.A, Ext.B and Ext.C (U+2A700..U+2B734) if UCA_Version is 20.
1440
1441 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1442
1443 ex. CJK Unified Ideographs in the JIS code point order.
1444
1445   overrideCJK => sub {
1446       my $u = shift;             # get a Unicode codepoint
1447       my $b = pack('n', $u);     # to UTF-16BE
1448       my $s = your_unicode_to_sjis_converter($b); # convert
1449       my $n = unpack('n', $s);   # convert sjis to short
1450       [ $n, 0x20, 0x2, $u ];     # return the collation element
1451   },
1452
1453 ex. ignores all CJK Unified Ideographs.
1454
1455   overrideCJK => sub {()}, # CODEREF returning empty list
1456
1457    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1458    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1459
1460 If C<undef> is passed explicitly as the value for this key,
1461 weights for CJK Unified Ideographs are treated as undefined.
1462 But assignment of weight for CJK Unified Ideographs
1463 in table or C<entry> is still valid.
1464
1465 =item overrideHangul
1466
1467 -- see 7.1 Derived Collation Elements, UTS #10.
1468
1469 By default, Hangul Syllables are decomposed into Hangul Jamo,
1470 even if C<(normalization =E<gt> undef)>.
1471 But the mapping of Hangul Syllables may be overrided.
1472
1473 This parameter works like C<overrideCJK>, so see there for examples.
1474
1475 If you want to override the mapping of Hangul Syllables,
1476 NFD, NFKD, and FCD are not appropriate,
1477 since they will decompose Hangul Syllables before overriding.
1478
1479 If C<undef> is passed explicitly as the value for this key,
1480 weight for Hangul Syllables is treated as undefined
1481 without decomposition into Hangul Jamo.
1482 But definition of weight for Hangul Syllables
1483 in table or C<entry> is still valid.
1484
1485 =item preprocess
1486
1487 -- see 5.1 Preprocessing, UTS #10.
1488
1489 If specified, the coderef is used to preprocess
1490 before the formation of sort keys.
1491
1492 ex. dropping English articles, such as "a" or "the".
1493 Then, "the pen" is before "a pencil".
1494
1495      preprocess => sub {
1496            my $str = shift;
1497            $str =~ s/\b(?:an?|the)\s+//gi;
1498            return $str;
1499         },
1500
1501 C<preprocess> is performed before C<normalization> (if defined).
1502
1503 =item rearrange
1504
1505 -- see 3.1.3 Rearrangement, UTS #10.
1506
1507 Characters that are not coded in logical order and to be rearranged.
1508 If C<UCA_Version> is equal to or lesser than 11, default is:
1509
1510     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1511
1512 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1513 (a reference to empty list) as the value for this key.
1514
1515 If C<UCA_Version> is equal to or greater than 14, default is C<[]>
1516 (i.e. no rearrangement).
1517
1518 B<According to the version 9 of UCA, this parameter shall not be used;
1519 but it is not warned at present.>
1520
1521 =item table
1522
1523 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1524
1525 You can use another collation element table if desired.
1526
1527 The table file should locate in the F<Unicode/Collate> directory
1528 on C<@INC>. Say, if the filename is F<Foo.txt>,
1529 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1530
1531 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1532 If you will prepare your own table file, any name other than F<allkeys.txt>
1533 may be better to avoid namespace conflict.
1534
1535 B<NOTE>: When XSUB is used, the DUCET is compiled on building this
1536 module, and it may save time at the run time.
1537 Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
1538 or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
1539 will prevent this module using the compiled DUCET.
1540
1541 If C<undef> is passed explicitly as the value for this key,
1542 no file is read (but you can define collation elements via C<entry>).
1543
1544 A typical way to define a collation element table
1545 without any file of table:
1546
1547    $onlyABC = Unicode::Collate->new(
1548        table => undef,
1549        entry => << 'ENTRIES',
1550 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1551 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1552 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1553 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1554 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1555 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1556 ENTRIES
1557     );
1558
1559 If C<ignoreName> or C<undefName> is used, character names should be
1560 specified as a comment (following C<#>) on each line.
1561
1562 =item undefChar
1563
1564 =item undefName
1565
1566 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1567
1568 Undefines the collation element as if it were unassigned in the table.
1569 This reduces the size of the table.
1570 If an unassigned character appears in the string to be collated,
1571 the sort key is made from its codepoint
1572 as a single-character collation element,
1573 as it is greater than any other assigned collation elements
1574 (in the codepoint order among the unassigned characters).
1575 But, it'd be better to ignore characters
1576 unfamiliar to you and maybe never used.
1577
1578 Through C<undefChar>, any character matching C<qr/$undefChar/>
1579 will be undefined. Through C<undefName>, any character whose name
1580 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1581 will be undefined.
1582
1583 ex. Collation weights for beyond-BMP characters are not stored in object:
1584
1585     undefChar => qr/[^\0-\x{fffd}]/,
1586
1587 =item upper_before_lower
1588
1589 -- see 6.6 Case Comparisons, UTS #10.
1590
1591 By default, lowercase is before uppercase.
1592 If the parameter is made true, this is reversed.
1593
1594 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1595 distinctions must occur in level 3, and their weights at level 3 must be
1596 same as those mentioned in 7.3.1, UTS #10.
1597 If you define your collation elements which differs from this requirement,
1598 this parameter doesn't work validly.
1599
1600 =item variable
1601
1602 -- see 3.2.2 Variable Weighting, UTS #10.
1603
1604 This key allows to variable weighting for variable collation elements,
1605 which are marked with an ASTERISK in the table
1606 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1607
1608    variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1609
1610 These names are case-insensitive.
1611 By default (if specification is omitted), 'shifted' is adopted.
1612
1613    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1614                     considered at the 4th level.
1615
1616    'Non-Ignorable'  Variable elements are not reset to ignorable.
1617
1618    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1619                     their level 4 weight is replaced by the old level 1 weight.
1620                     Level 4 weight for Non-Variable elements is 0xFFFF.
1621
1622    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1623                     are trimmed.
1624
1625 =back
1626
1627 =head2 Methods for Collation
1628
1629 =over 4
1630
1631 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1632
1633 Sorts a list of strings.
1634
1635 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1636
1637 Returns 1 (when C<$a> is greater than C<$b>)
1638 or 0 (when C<$a> is equal to C<$b>)
1639 or -1 (when C<$a> is lesser than C<$b>).
1640
1641 =item C<$result = $Collator-E<gt>eq($a, $b)>
1642
1643 =item C<$result = $Collator-E<gt>ne($a, $b)>
1644
1645 =item C<$result = $Collator-E<gt>lt($a, $b)>
1646
1647 =item C<$result = $Collator-E<gt>le($a, $b)>
1648
1649 =item C<$result = $Collator-E<gt>gt($a, $b)>
1650
1651 =item C<$result = $Collator-E<gt>ge($a, $b)>
1652
1653 They works like the same name operators as theirs.
1654
1655    eq : whether $a is equal to $b.
1656    ne : whether $a is not equal to $b.
1657    lt : whether $a is lesser than $b.
1658    le : whether $a is lesser than $b or equal to $b.
1659    gt : whether $a is greater than $b.
1660    ge : whether $a is greater than $b or equal to $b.
1661
1662 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1663
1664 -- see 4.3 Form Sort Key, UTS #10.
1665
1666 Returns a sort key.
1667
1668 You compare the sort keys using a binary comparison
1669 and get the result of the comparison of the strings using UCA.
1670
1671    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1672
1673       is equivalent to
1674
1675    $Collator->cmp($a, $b)
1676
1677 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1678
1679 Converts a sorting key into its representation form.
1680 If C<UCA_Version> is 8, the output is slightly different.
1681
1682    use Unicode::Collate;
1683    my $c = Unicode::Collate->new();
1684    print $c->viewSortKey("Perl"),"\n";
1685
1686    # output:
1687    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1688    #  Level 1               Level 2               Level 3               Level 4
1689
1690 =back
1691
1692 =head2 Methods for Searching
1693
1694 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1695 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1696 C<subst>, C<gsubst>) is croaked,
1697 as the position and the length might differ
1698 from those on the specified string.
1699 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1700
1701 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1702 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1703 but they are not aware of any pattern, but only a literal substring.
1704
1705 =over 4
1706
1707 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1708
1709 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1710
1711 If C<$substring> matches a part of C<$string>, returns
1712 the position of the first occurrence of the matching part in scalar context;
1713 in list context, returns a two-element list of
1714 the position and the length of the matching part.
1715
1716 If C<$substring> does not match any part of C<$string>,
1717 returns C<-1> in scalar context and
1718 an empty list in list context.
1719
1720 e.g. you say
1721
1722   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1723                                      # (normalization => undef) is REQUIRED.
1724   my $str = "Ich muß studieren Perl.";
1725   my $sub = "MÜSS";
1726   my $match;
1727   if (my($pos,$len) = $Collator->index($str, $sub)) {
1728       $match = substr($str, $pos, $len);
1729   }
1730
1731 and get C<"muß"> in C<$match> since C<"muß">
1732 is primary equal to C<"MÜSS">.
1733
1734 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1735
1736 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1737
1738 If C<$substring> matches a part of C<$string>, in scalar context, returns
1739 B<a reference to> the first occurrence of the matching part
1740 (C<$match_ref> is always true if matches,
1741 since every reference is B<true>);
1742 in list context, returns the first occurrence of the matching part.
1743
1744 If C<$substring> does not match any part of C<$string>,
1745 returns C<undef> in scalar context and
1746 an empty list in list context.
1747
1748 e.g.
1749
1750     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1751         print "matches [$$match_ref].\n";
1752     } else {
1753         print "doesn't match.\n";
1754     }
1755
1756      or
1757
1758     if (($match) = $Collator->match($str, $sub)) { # list context
1759         print "matches [$match].\n";
1760     } else {
1761         print "doesn't match.\n";
1762     }
1763
1764 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1765
1766 If C<$substring> matches a part of C<$string>, returns
1767 all the matching parts (or matching count in scalar context).
1768
1769 If C<$substring> does not match any part of C<$string>,
1770 returns an empty list.
1771
1772 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1773
1774 If C<$substring> matches a part of C<$string>,
1775 the first occurrence of the matching part is replaced by C<$replacement>
1776 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1777
1778 C<$replacement> can be a C<CODEREF>,
1779 taking the matching part as an argument,
1780 and returning a string to replace the matching part
1781 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1782
1783 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1784
1785 If C<$substring> matches a part of C<$string>,
1786 all the occurrences of the matching part is replaced by C<$replacement>
1787 (C<$string> is modified) and return C<$count>.
1788
1789 C<$replacement> can be a C<CODEREF>,
1790 taking the matching part as an argument,
1791 and returning a string to replace the matching part
1792 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1793
1794 e.g.
1795
1796   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1797                                      # (normalization => undef) is REQUIRED.
1798   my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1799   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1800
1801   # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1802   # i.e., all the camels are made bold-faced.
1803
1804 =back
1805
1806 =head2 Other Methods
1807
1808 =over 4
1809
1810 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1811
1812 Change the value of specified keys and returns the changed part.
1813
1814     $Collator = Unicode::Collate->new(level => 4);
1815
1816     $Collator->eq("perl", "PERL"); # false
1817
1818     %old = $Collator->change(level => 2); # returns (level => 4).
1819
1820     $Collator->eq("perl", "PERL"); # true
1821
1822     $Collator->change(%old); # returns (level => 2).
1823
1824     $Collator->eq("perl", "PERL"); # false
1825
1826 Not all C<(key,value)>s are allowed to be changed.
1827 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1828
1829 In the scalar context, returns the modified collator
1830 (but it is B<not> a clone from the original).
1831
1832     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1833
1834     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1835
1836     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1837
1838 =item C<$version = $Collator-E<gt>version()>
1839
1840 Returns the version number (a string) of the Unicode Standard
1841 which the C<table> file used by the collator object is based on.
1842 If the table does not include a version line (starting with C<@version>),
1843 returns C<"unknown">.
1844
1845 =item C<UCA_Version()>
1846
1847 Returns the tracking version number of UTS #10 this module consults.
1848
1849 =item C<Base_Unicode_Version()>
1850
1851 Returns the version number of UTS #10 this module consults.
1852
1853 =back
1854
1855 =head1 EXPORT
1856
1857 No method will be exported.
1858
1859 =head1 INSTALL
1860
1861 Though this module can be used without any C<table> file,
1862 to use this module easily, it is recommended to install a table file
1863 in the UCA format, by copying it under the directory
1864 <a place in @INC>/Unicode/Collate.
1865
1866 The most preferable one is "The Default Unicode Collation Element Table"
1867 (aka DUCET), available from the Unicode Consortium's website:
1868
1869    http://www.unicode.org/Public/UCA/
1870
1871    http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1872
1873 If DUCET is not installed, it is recommended to copy the file
1874 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1875 to <a place in @INC>/Unicode/Collate/allkeys.txt
1876 manually.
1877
1878 =head1 CAVEATS
1879
1880 =over 4
1881
1882 =item Normalization
1883
1884 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1885 module (see L<Unicode::Normalize>).
1886
1887 If you need not it (say, in the case when you need not
1888 handle any combining characters),
1889 assign C<normalization =E<gt> undef> explicitly.
1890
1891 -- see 6.5 Avoiding Normalization, UTS #10.
1892
1893 =item Conformance Test
1894
1895 The Conformance Test for the UCA is available
1896 under L<http://www.unicode.org/Public/UCA/>.
1897
1898 For F<CollationTest_SHIFTED.txt>,
1899 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1900 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1901 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1902
1903 B<Unicode::Normalize is required to try The Conformance Test.>
1904
1905 =back
1906
1907 =head1 AUTHOR, COPYRIGHT AND LICENSE
1908
1909 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1910 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
1911 SADAHIRO Tomoyuki. Japan. All rights reserved.
1912
1913 This module is free software; you can redistribute it and/or
1914 modify it under the same terms as Perl itself.
1915
1916 The file Unicode/Collate/allkeys.txt was copied verbatim
1917 from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
1918 This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
1919 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1920
1921 =head1 SEE ALSO
1922
1923 =over 4
1924
1925 =item Unicode Collation Algorithm - UTS #10
1926
1927 L<http://www.unicode.org/reports/tr10/>
1928
1929 =item The Default Unicode Collation Element Table (DUCET)
1930
1931 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1932
1933 =item The conformance test for the UCA
1934
1935 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1936
1937 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1938
1939 =item Hangul Syllable Type
1940
1941 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1942
1943 =item Unicode Normalization Forms - UAX #15
1944
1945 L<http://www.unicode.org/reports/tr15/>
1946
1947 =back
1948
1949 =cut