lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 no warnings 'utf8';
  16
  17 our $VERSION = '0.40';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 my @Path = qw(Unicode Collate);
  21 my $KeyFile = "allkeys.txt";
  22
  23 # Perl's boolean
  24 use constant TRUE  => 1;
  25 use constant FALSE => "";
  26 use constant NOMATCHPOS => -1;
  27
  28 # A coderef to get combining class imported from Unicode::Normalize
  29 # (i.e. \&Unicode::Normalize::getCombinClass).
  30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  31 my $CVgetCombinClass;
  32
  33 # Supported Levels
  34 use constant MinLevel => 1;
  35 use constant MaxLevel => 4;
  36
  37 # Minimum weights at level 2 and 3, respectively
  38 use constant Min2Wt => 0x20;
  39 use constant Min3Wt => 0x02;
  40
  41 # Shifted weight at 4th level
  42 use constant Shift4Wt => 0xFFFF;
  43
  44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  45 # PROBLEM: The Default Unicode Collation Element Table
  46 # has weights over 0xFFFF at the 4th level.
  47 # The tie-breaking in the variable weights
  48 # other than "shift" (as well as "shift-trimmed") is unreliable.
  49 use constant VCE_TEMPLATE => 'Cn4';
  50
  51 # A sort key: 16-bit weights
  52 # See also the PROBLEM on VCE_TEMPLATE above.
  53 use constant KEY_TEMPLATE => 'n*';
  54
  55 # Level separator in a sort key:
  56 # i.e. pack(KEY_TEMPLATE, 0)
  57 use constant LEVEL_SEP => "\0\0";
  58
  59 # As Unicode code point separator for hash keys.
  60 # A joined code point string (denoted by JCPS below)
  61 # like "65;768" is used for internal processing
  62 # instead of Perl's Unicode string like "\x41\x{300}",
  63 # as the native code point is different from the Unicode code point
  64 # on EBCDIC platform.
  65 # This character must not be included in any stringified
  66 # representation of an integer.
  67 use constant CODE_SEP => ';';
  68
  69 # boolean values of variable weights
  70 use constant NON_VAR => 0; # Non-Variable character
  71 use constant VAR     => 1; # Variable character
  72
  73 # specific code points
  74 use constant Hangul_LBase  => 0x1100;
  75 use constant Hangul_LIni   => 0x1100;
  76 use constant Hangul_LFin   => 0x1159;
  77 use constant Hangul_LFill  => 0x115F;
  78 use constant Hangul_VBase  => 0x1161;
  79 use constant Hangul_VIni   => 0x1160;
  80 use constant Hangul_VFin   => 0x11A2;
  81 use constant Hangul_TBase  => 0x11A7;
  82 use constant Hangul_TIni   => 0x11A8;
  83 use constant Hangul_TFin   => 0x11F9;
  84 use constant Hangul_TCount => 28;
  85 use constant Hangul_NCount => 588;
  86 use constant Hangul_SBase  => 0xAC00;
  87 use constant Hangul_SIni   => 0xAC00;
  88 use constant Hangul_SFin   => 0xD7A3;
  89 use constant CJK_UidIni    => 0x4E00;
  90 use constant CJK_UidFin    => 0x9FA5;
  91 use constant CJK_ExtAIni   => 0x3400;
  92 use constant CJK_ExtAFin   => 0x4DB5;
  93 use constant CJK_ExtBIni   => 0x20000;
  94 use constant CJK_ExtBFin   => 0x2A6D6;
  95 use constant BMP_Max       => 0xFFFF;
  96
  97 # Logical_Order_Exception in PropList.txt
  98 # TODO: synchronization with change of PropList.txt.
  99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 100
 101 sub UCA_Version { "11" }
 102
 103 sub Base_Unicode_Version { "4.0" }
 104
 105 ######
 106
 107 sub pack_U {
 108     return pack('U*', @_);
 109 }
 110
 111 sub unpack_U {
 112     return unpack('U*', pack('U*').shift);
 113 }
 114
 115 ######
 116
 117 my (%VariableOK);
 118 @VariableOK{ qw/
 119     blanked  non-ignorable  shifted  shift-trimmed
 120   / } = (); # keys lowercased
 121
 122 our @ChangeOK = qw/
 123     alternate backwards level normalization rearrange
 124     katakana_before_hiragana upper_before_lower
 125     overrideHangul overrideCJK preprocess UCA_Version
 126     hangul_terminator variable
 127   /;
 128
 129 our @ChangeNG = qw/
 130     entry mapping table maxlength
 131     ignoreChar ignoreName undefChar undefName variableTable
 132     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 133     derivCode normCode rearrangeHash
 134     backwardsFlag
 135   /;
 136 # The hash key 'ignored' is deleted at v 0.21.
 137 # The hash key 'isShift' is deleted at v 0.23.
 138 # The hash key 'combining' is deleted at v 0.24.
 139 # The hash key 'entries' is deleted at v 0.30.
 140 # The hash key 'L3_ignorable' is deleted at v 0.40.
 141
 142 sub version {
 143     my $self = shift;
 144     return $self->{versionTable} || 'unknown';
 145 }
 146
 147 my (%ChangeOK, %ChangeNG);
 148 @ChangeOK{ @ChangeOK } = ();
 149 @ChangeNG{ @ChangeNG } = ();
 150
 151 sub change {
 152     my $self = shift;
 153     my %hash = @_;
 154     my %old;
 155     if (exists $hash{variable} && exists $hash{alternate}) {
 156         delete $hash{alternate};
 157     }
 158     elsif (!exists $hash{variable} && exists $hash{alternate}) {
 159         $hash{variable} = $hash{alternate};
 160     }
 161     foreach my $k (keys %hash) {
 162         if (exists $ChangeOK{$k}) {
 163             $old{$k} = $self->{$k};
 164             $self->{$k} = $hash{$k};
 165         }
 166         elsif (exists $ChangeNG{$k}) {
 167             croak "change of $k via change() is not allowed!";
 168         }
 169         # else => ignored
 170     }
 171     $self->checkCollator;
 172     return wantarray ? %old : $self;
 173 }
 174
 175 sub _checkLevel {
 176     my $level = shift;
 177     my $key   = shift; # 'level' or 'backwards'
 178     MinLevel <= $level or croak sprintf
 179         "Illegal level %d (in value for key '%s') lower than %d.",
 180             $level, $key, MinLevel;
 181     $level <= MaxLevel or croak sprintf
 182         "Unsupported level %d (in value for key '%s') higher than %d.",
 183             $level, $key, MaxLevel;
 184 }
 185
 186 my %DerivCode = (
 187     8 => \&_derivCE_8,
 188     9 => \&_derivCE_9,
 189    11 => \&_derivCE_9, # 11 == 9
 190 );
 191
 192 sub checkCollator {
 193     my $self = shift;
 194     _checkLevel($self->{level}, "level");
 195
 196     $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 197         or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 198
 199     $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 200                                 $self->{alternateTable} || 'shifted';
 201     $self->{variable} = $self->{alternate} = lc($self->{variable});
 202     exists $VariableOK{ $self->{variable} }
 203         or croak "$PACKAGE unknown variable tag name: $self->{variable}";
 204
 205     if (! defined $self->{backwards}) {
 206         $self->{backwardsFlag} = 0;
 207     }
 208     elsif (! ref $self->{backwards}) {
 209         _checkLevel($self->{backwards}, "backwards");
 210         $self->{backwardsFlag} = 1 << $self->{backwards};
 211     }
 212     else {
 213         my %level;
 214         $self->{backwardsFlag} = 0;
 215         for my $b (@{ $self->{backwards} }) {
 216             _checkLevel($b, "backwards");
 217             $level{$b} = 1;
 218         }
 219         for my $v (sort keys %level) {
 220             $self->{backwardsFlag} += 1 << $v;
 221         }
 222     }
 223
 224     defined $self->{rearrange} or $self->{rearrange} = [];
 225     ref $self->{rearrange}
 226         or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 227
 228     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 229     $self->{rearrangeHash} = undef;
 230
 231     if (@{ $self->{rearrange} }) {
 232         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 233     }
 234
 235     $self->{normCode} = undef;
 236
 237     if (defined $self->{normalization}) {
 238         eval { require Unicode::Normalize };
 239         $@ and croak "Unicode::Normalize is required to normalize strings";
 240
 241         $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 242
 243         if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 244             $self->{normCode} = \&Unicode::Normalize::NFD;
 245         }
 246         elsif ($self->{normalization} ne 'prenormalized') {
 247             my $norm = $self->{normalization};
 248             $self->{normCode} = sub {
 249                 Unicode::Normalize::normalize($norm, shift);
 250             };
 251             eval { $self->{normCode}->("") }; # try
 252             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 253         }
 254     }
 255     return;
 256 }
 257
 258 sub new
 259 {
 260     my $class = shift;
 261     my $self = bless { @_ }, $class;
 262
 263     # If undef is passed explicitly, no file is read.
 264     $self->{table} = $KeyFile if ! exists $self->{table};
 265     $self->read_table if defined $self->{table};
 266
 267     if ($self->{entry}) {
 268         while ($self->{entry} =~ /([^\n]+)/g) {
 269             $self->parseEntry($1);
 270         }
 271     }
 272
 273     $self->{level} ||= MaxLevel;
 274     $self->{UCA_Version} ||= UCA_Version();
 275
 276     $self->{overrideHangul} = FALSE
 277         if ! exists $self->{overrideHangul};
 278     $self->{overrideCJK} = FALSE
 279         if ! exists $self->{overrideCJK};
 280     $self->{normalization} = 'NFD'
 281         if ! exists $self->{normalization};
 282     $self->{rearrange} = $self->{rearrangeTable} || $DefaultRearrange
 283         if ! exists $self->{rearrange};
 284     $self->{backwards} = $self->{backwardsTable}
 285         if ! exists $self->{backwards};
 286
 287     $self->checkCollator;
 288
 289     return $self;
 290 }
 291
 292 sub read_table {
 293     my $self = shift;
 294
 295     my($f, $fh);
 296     foreach my $d (@INC) {
 297         $f = File::Spec->catfile($d, @Path, $self->{table});
 298         last if open($fh, $f);
 299         $f = undef;
 300     }
 301     defined $f
 302         or croak "$PACKAGE: $self->{table} is not found in @INC";
 303
 304     while (<$fh>) {
 305         next if /^\s*#/;
 306         unless (s/^\s*\@//) {
 307             $self->parseEntry($_);
 308             next;
 309         }
 310
 311         if (/^version\s*(\S*)/) {
 312             $self->{versionTable} ||= $1;
 313         }
 314         elsif (/^variable\s+(\S*)/) { # since UTS #10-9
 315             $self->{variableTable} ||= $1;
 316         }
 317         elsif (/^alternate\s+(\S*)/) { # till UTS #10-8
 318             $self->{alternateTable} ||= $1;
 319         }
 320         elsif (/^backwards\s+(\S*)/) {
 321             push @{ $self->{backwardsTable} }, $1;
 322         }
 323         elsif (/^forwards\s+(\S*)/) { # parhaps no use
 324             push @{ $self->{forwardsTable} }, $1;
 325         }
 326         elsif (/^rearrange\s+(.*)/) { # (\S*) is NG
 327             push @{ $self->{rearrangeTable} }, _getHexArray($1);
 328         }
 329     }
 330     close $fh;
 331 }
 332
 333
 334 ##
 335 ## get $line, parse it, and write an entry in $self
 336 ##
 337 sub parseEntry
 338 {
 339     my $self = shift;
 340     my $line = shift;
 341     my($name, $entry, @uv, @key);
 342
 343     return if $line !~ /^\s*[0-9A-Fa-f]/;
 344
 345     # removes comment and gets name
 346     $name = $1
 347         if $line =~ s/[#%]\s*(.*)//;
 348     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 349
 350     # gets element
 351     my($e, $k) = split /;/, $line;
 352     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 353         if ! $k;
 354
 355     @uv = _getHexArray($e);
 356     return if !@uv;
 357
 358     $entry = join(CODE_SEP, @uv); # in JCPS
 359
 360     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 361         my $ele = pack_U(@uv);
 362
 363         # regarded as if it were not entried in the table
 364         return
 365             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 366
 367         # replaced as completely ignorable
 368         $k = '[.0000.0000.0000.0000]'
 369             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 370     }
 371
 372     # replaced as completely ignorable
 373     $k = '[.0000.0000.0000.0000]'
 374         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 375
 376     my $is_L3_ignorable = TRUE;
 377
 378     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 379         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 380         my @wt = _getHexArray($arr);
 381         push @key, pack(VCE_TEMPLATE, $var, @wt);
 382         $is_L3_ignorable = FALSE
 383             if $wt[0] + $wt[1] + $wt[2] != 0;
 384           # if $arr !~ /[1-9A-Fa-f]/; NG
 385           # Conformance Test shows L3-ignorable is completely ignorable.
 386         # For expansion, an entry $is_L3_ignorable
 387         # if and only if "all" CEs are [.0000.0000.0000].
 388     }
 389
 390     $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
 391
 392     if (@uv > 1) {
 393         (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 394             and $self->{maxlength}{$uv[0]} = @uv;
 395     }
 396 }
 397
 398
 399 ##
 400 ## VCE = _varCE(variable term, VCE)
 401 ##
 402 sub _varCE
 403 {
 404     my $vbl = shift;
 405     my $vce = shift;
 406     if ($vbl eq 'non-ignorable') {
 407         return $vce;
 408     }
 409     my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
 410
 411     if ($var) {
 412         return pack(VCE_TEMPLATE, $var, 0, 0, 0,
 413                 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
 414     }
 415     elsif ($vbl eq 'blanked') {
 416         return $vce;
 417     }
 418     else {
 419         return pack(VCE_TEMPLATE, $var, @wt[0..2],
 420             $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
 421     }
 422 }
 423
 424 sub viewSortKey
 425 {
 426     my $self = shift;
 427     $self->visualizeSortKey($self->getSortKey(@_));
 428 }
 429
 430 sub visualizeSortKey
 431 {
 432     my $self = shift;
 433     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 434
 435     if ($self->{UCA_Version} <= 8) {
 436         $view =~ s/ ?0000 ?/|/g;
 437     } else {
 438         $view =~ s/\b0000\b/|/g;
 439     }
 440     return "[$view]";
 441 }
 442
 443
 444 ##
 445 ## arrayref of JCPS   = splitEnt(string to be collated)
 446 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 447 ##
 448 sub splitEnt
 449 {
 450     my $self = shift;
 451     my $wLen = $_[1];
 452
 453     my $code = $self->{preprocess};
 454     my $norm = $self->{normCode};
 455     my $map  = $self->{mapping};
 456     my $max  = $self->{maxlength};
 457     my $reH  = $self->{rearrangeHash};
 458     my $ver9 = $self->{UCA_Version} >= 9;
 459
 460     my ($str, @buf);
 461
 462     if ($wLen) {
 463         $code and croak "Preprocess breaks character positions. "
 464                         . "Don't use with index(), match(), etc.";
 465         $norm and croak "Normalization breaks character positions. "
 466                         . "Don't use with index(), match(), etc.";
 467         $str = $_[0];
 468     }
 469     else {
 470         $str = $_[0];
 471         $str = &$code($str) if ref $code;
 472         $str = &$norm($str) if ref $norm;
 473     }
 474
 475     # get array of Unicode code point of string.
 476     my @src = unpack_U($str);
 477
 478     # rearrangement:
 479     # Character positions are not kept if rearranged,
 480     # then neglected if $wLen is true.
 481     if ($reH && ! $wLen) {
 482         for (my $i = 0; $i < @src; $i++) {
 483             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 484                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 485                 $i++;
 486             }
 487         }
 488     }
 489
 490     # To remove a character marked as a completely ignorable.
 491     for (my $i = 0; $i < @src; $i++) {
 492         $src[$i] = undef
 493             if _isIllegal($src[$i]) || ($ver9 &&
 494                 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
 495     }
 496
 497     for (my $i = 0; $i < @src; $i++) {
 498         my $jcps = $src[$i];
 499         next if ! defined $jcps;
 500         my $i_orig = $i;
 501
 502         if ($max->{$jcps}) { # contract
 503             my $temp_jcps = $jcps;
 504             my $jcpsLen = 1;
 505             my $maxLen = $max->{$jcps};
 506
 507             for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 508                 next if ! defined $src[$p];
 509                 $temp_jcps .= CODE_SEP . $src[$p];
 510                 $jcpsLen++;
 511                 if ($map->{$temp_jcps}) {
 512                     $jcps = $temp_jcps;
 513                     $i = $p;
 514                 }
 515             }
 516
 517         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 518         # This process requires Unicode::Normalize.
 519         # If "normalization" is undef, here should be skipped *always*
 520         # (in spite of bool value of $CVgetCombinClass),
 521         # since canonical ordering cannot be expected.
 522         # Blocked combining character should not be contracted.
 523
 524             if ($self->{normalization})
 525             # $self->{normCode} is false in the case of "prenormalized".
 526             {
 527                 my $preCC = 0;
 528                 my $curCC = 0;
 529
 530                 for (my $p = $i + 1; $p < @src; $p++) {
 531                     next if ! defined $src[$p];
 532                     $curCC = $CVgetCombinClass->($src[$p]);
 533                     last unless $curCC;
 534                     my $tail = CODE_SEP . $src[$p];
 535                     if ($preCC != $curCC && $map->{$jcps.$tail}) {
 536                         $jcps .= $tail;
 537                         $src[$p] = undef;
 538                     } else {
 539                         $preCC = $curCC;
 540                     }
 541                 }
 542             }
 543         }
 544
 545         if ($wLen) {
 546             for (; $i + 1 < @src; $i++) {
 547                 last if defined $src[$i + 1];
 548             }
 549         }
 550
 551         push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 552     }
 553     return \@buf;
 554 }
 555
 556
 557 ##
 558 ## list of VCE = getWt(JCPS)
 559 ##
 560 sub getWt
 561 {
 562     my $self = shift;
 563     my $u    = shift;
 564     my $vbl  = $self->{variable};
 565     my $map  = $self->{mapping};
 566     my $der  = $self->{derivCode};
 567
 568     return if !defined $u;
 569     return map(_varCE($vbl, $_), @{ $map->{$u} })
 570         if $map->{$u};
 571
 572     # JCPS must not be a contraction, then it's a code point.
 573     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 574         my $hang = $self->{overrideHangul};
 575         my @hangulCE;
 576         if ($hang) {
 577             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 578         }
 579         elsif (!defined $hang) {
 580             @hangulCE = $der->($u);
 581         }
 582         else {
 583             my $max  = $self->{maxlength};
 584             my @decH = _decompHangul($u);
 585
 586             if (@decH == 2) {
 587                 my $contract = join(CODE_SEP, @decH);
 588                 @decH = ($contract) if $map->{$contract};
 589             } else { # must be <@decH == 3>
 590                 if ($max->{$decH[0]}) {
 591                     my $contract = join(CODE_SEP, @decH);
 592                     if ($map->{$contract}) {
 593                         @decH = ($contract);
 594                     } else {
 595                         $contract = join(CODE_SEP, @decH[0,1]);
 596                         $map->{$contract} and @decH = ($contract, $decH[2]);
 597                     }
 598                     # even if V's ignorable, LT contraction is not supported.
 599                     # If such a situatution were required, NFD should be used.
 600                 }
 601                 if (@decH == 3 && $max->{$decH[1]}) {
 602                     my $contract = join(CODE_SEP, @decH[1,2]);
 603                     $map->{$contract} and @decH = ($decH[0], $contract);
 604                 }
 605             }
 606
 607             @hangulCE = map({
 608                     $map->{$_} ? @{ $map->{$_} } : $der->($_);
 609                 } @decH);
 610         }
 611         return map _varCE($vbl, $_), @hangulCE;
 612     }
 613     elsif (CJK_UidIni  <= $u && $u <= CJK_UidFin  ||
 614            CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 615            CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) {
 616         my $cjk  = $self->{overrideCJK};
 617         return map _varCE($vbl, $_),
 618             $cjk
 619                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 620                 : defined $cjk && $self->{UCA_Version} <= 8 && $u <= BMP_Max
 621                     ? pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u)
 622                     : $der->($u);
 623     }
 624     else {
 625         return map _varCE($vbl, $_), $der->($u);
 626     }
 627 }
 628
 629
 630 ##
 631 ## string sortkey = getSortKey(string arg)
 632 ##
 633 sub getSortKey
 634 {
 635     my $self = shift;
 636     my $lev  = $self->{level};
 637     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 638     my $ver9 = $self->{UCA_Version} >= 9;
 639     my $v2i  = $ver9 && $self->{variable} ne 'non-ignorable';
 640
 641     my @buf; # weight arrays
 642     if ($self->{hangul_terminator}) {
 643         my $preHST = '';
 644         foreach my $jcps (@$rEnt) {
 645             # weird things like VL, TL-contraction are not considered!
 646             my $curHST = '';
 647             foreach my $u (split /;/, $jcps) {
 648                 $curHST .= getHST($u);
 649             }
 650             if ($preHST && !$curHST || # hangul before non-hangul
 651                 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 652                 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 653                 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 654
 655                 push @buf, $self->getWtHangulTerm();
 656             }
 657             $preHST = $curHST;
 658
 659             push @buf, $self->getWt($jcps);
 660         }
 661         $preHST # end at hangul
 662             and push @buf, $self->getWtHangulTerm();
 663     }
 664     else {
 665         foreach my $jcps (@$rEnt) {
 666             push @buf, $self->getWt($jcps);
 667         }
 668     }
 669
 670     # make sort key
 671     my @ret = ([],[],[],[]);
 672     my $last_is_variable;
 673
 674     foreach my $vwt (@buf) {
 675         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 676         if ($v2i) {
 677             if ($var) {
 678                 $last_is_variable = TRUE;
 679             }
 680             elsif (!$wt[0]) { # ignorable
 681                 next if $last_is_variable;
 682             }
 683             else {
 684                 $last_is_variable = FALSE;
 685             }
 686         }
 687         foreach my $v (0..$lev-1) {
 688             0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
 689         }
 690     }
 691
 692     # modification of tertiary weights
 693     if ($self->{upper_before_lower}) {
 694         foreach (@{ $ret[2] }) {
 695             if    (0x8 <= $_ && $_ <= 0xC) { $_ -= 6 } # lower
 696             elsif (0x2 <= $_ && $_ <= 0x6) { $_ += 6 } # upper
 697             elsif ($_ == 0x1C)             { $_ += 1 } # square upper
 698             elsif ($_ == 0x1D)             { $_ -= 1 } # square lower
 699         }
 700     }
 701     if ($self->{katakana_before_hiragana}) {
 702         foreach (@{ $ret[2] }) {
 703             if    (0x0F <= $_ && $_ <= 0x13) { $_ -= 2 } # katakana
 704             elsif (0x0D <= $_ && $_ <= 0x0E) { $_ += 5 } # hiragana
 705         }
 706     }
 707
 708     if ($self->{backwardsFlag}) {
 709         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 710             if ($self->{backwardsFlag} & (1 << $v)) {
 711                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 712             }
 713         }
 714     }
 715
 716     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 717 }
 718
 719
 720 ##
 721 ## int compare = cmp(string a, string b)
 722 ##
 723 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 724 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 725 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 726 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 727 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 728 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 729 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 730
 731 ##
 732 ## list[strings] sorted = sort(list[strings] arg)
 733 ##
 734 sub sort {
 735     my $obj = shift;
 736     return
 737         map { $_->[1] }
 738             sort{ $a->[0] cmp $b->[0] }
 739                 map [ $obj->getSortKey($_), $_ ], @_;
 740 }
 741
 742
 743 sub _derivCE_9 {
 744     my $u = shift;
 745     my $base =
 746         (CJK_UidIni  <= $u && $u <= CJK_UidFin)
 747             ? 0xFB40 : # CJK
 748         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 749          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 750             ? 0xFB80   # CJK ext.
 751             : 0xFBC0;  # others
 752
 753     my $aaaa = $base + ($u >> 15);
 754     my $bbbb = ($u & 0x7FFF) | 0x8000;
 755     return
 756         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 757         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 758 }
 759
 760 sub _derivCE_8 {
 761     my $code = shift;
 762     my $aaaa =  0xFF80 + ($code >> 15);
 763     my $bbbb = ($code & 0x7FFF) | 0x8000;
 764     return
 765         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 766         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 767 }
 768
 769
 770 sub getWtHangulTerm {
 771     my $self = shift;
 772     return _varCE($self->{variable},
 773         pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
 774 }
 775
 776
 777 ##
 778 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 779 ##
 780 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 781
 782 #
 783 # $code *must* be in Hangul syllable.
 784 # Check it before you enter here.
 785 #
 786 sub _decompHangul {
 787     my $code = shift;
 788     my $SIndex = $code - Hangul_SBase;
 789     my $LIndex = int( $SIndex / Hangul_NCount);
 790     my $VIndex = int(($SIndex % Hangul_NCount) / Hangul_TCount);
 791     my $TIndex =      $SIndex % Hangul_TCount;
 792     return (
 793         Hangul_LBase + $LIndex,
 794         Hangul_VBase + $VIndex,
 795         $TIndex ? (Hangul_TBase + $TIndex) : (),
 796     );
 797 }
 798
 799 sub _isIllegal {
 800     my $code = shift;
 801     return ! defined $code                      # removed
 802         || ($code < 0 || 0x10FFFF < $code)      # out of range
 803         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 804         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 805         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 806     ;
 807 }
 808
 809 # Hangul Syllable Type
 810 sub getHST {
 811     my $u = shift;
 812     return
 813         Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
 814         Hangul_VIni <= $u && $u <= Hangul_VFin       ? "V" :
 815         Hangul_TIni <= $u && $u <= Hangul_TFin       ? "T" :
 816         Hangul_SIni <= $u && $u <= Hangul_SFin ?
 817             ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
 818 }
 819
 820
 821 ##
 822 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 823 ##
 824 sub _nonIgnorAtLevel($$)
 825 {
 826     my $wt = shift;
 827     return if ! defined $wt;
 828     my $lv = shift;
 829     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 830 }
 831
 832 ##
 833 ## bool _eqArray(
 834 ##    arrayref of arrayref[weights] source,
 835 ##    arrayref of arrayref[weights] substr,
 836 ##    int level)
 837 ## * comparison of graphemes vs graphemes.
 838 ##   @$source >= @$substr must be true (check it before call this);
 839 ##
 840 sub _eqArray($$$)
 841 {
 842     my $source = shift;
 843     my $substr = shift;
 844     my $lev = shift;
 845
 846     for my $g (0..@$substr-1){
 847         # Do the $g'th graphemes have the same number of AV weigths?
 848         return if @{ $source->[$g] } != @{ $substr->[$g] };
 849
 850         for my $w (0..@{ $substr->[$g] }-1) {
 851             for my $v (0..$lev-1) {
 852                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 853             }
 854         }
 855     }
 856     return 1;
 857 }
 858
 859 ##
 860 ## (int position, int length)
 861 ## int position = index(string, substring, position, [undoc'ed grobal])
 862 ##
 863 ## With "grobal" (only for the list context),
 864 ##  returns list of arrayref[position, length].
 865 ##
 866 sub index
 867 {
 868     my $self = shift;
 869     my $str  = shift;
 870     my $len  = length($str);
 871     my $subE = $self->splitEnt(shift);
 872     my $pos  = @_ ? shift : 0;
 873        $pos  = 0 if $pos < 0;
 874     my $grob = shift;
 875
 876     my $lev  = $self->{level};
 877     my $ver9 = $self->{UCA_Version} >= 9;
 878     my $v2i  = $self->{variable} ne 'non-ignorable';
 879
 880     if (! @$subE) {
 881         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 882         return $grob
 883             ? map([$_, 0], $temp..$len)
 884             : wantarray ? ($temp,0) : $temp;
 885     }
 886     $len < $pos
 887         and return wantarray ? () : NOMATCHPOS;
 888     my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 889     @$strE
 890         or return wantarray ? () : NOMATCHPOS;
 891
 892     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 893
 894     my $last_is_variable;
 895     for my $vwt (map $self->getWt($_), @$subE) {
 896         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 897         my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 898
 899         if ($v2i && $ver9) {
 900             if ($var) {
 901                 $last_is_variable = TRUE;
 902             }
 903             elsif (!$wt[0]) { # ignorable
 904                 $to_be_pushed = FALSE if $last_is_variable;
 905             }
 906             else {
 907                 $last_is_variable = FALSE;
 908             }
 909         }
 910
 911         if (@subWt && !$var && !$wt[0]) {
 912             push @{ $subWt[-1] }, \@wt if $to_be_pushed;
 913         } else {
 914             push @subWt, [ \@wt ];
 915         }
 916     }
 917
 918     my $count = 0;
 919     my $end = @$strE - 1;
 920
 921     $last_is_variable = FALSE; # reuse
 922     for (my $i = 0; $i <= $end; ) { # no $i++
 923         my $found_base = 0;
 924
 925         # fetch a grapheme
 926         while ($i <= $end && $found_base == 0) {
 927             for my $vwt ($self->getWt($strE->[$i][0])) {
 928                 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 929                 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 930
 931                 if ($v2i && $ver9) {
 932                     if ($var) {
 933                         $last_is_variable = TRUE;
 934                     }
 935                     elsif (!$wt[0]) { # ignorable
 936                         $to_be_pushed = FALSE if $last_is_variable;
 937                     }
 938                     else {
 939                         $last_is_variable = FALSE;
 940                     }
 941                 }
 942
 943                 if (@strWt && !$var && !$wt[0]) {
 944                     push @{ $strWt[-1] }, \@wt if $to_be_pushed;
 945                     $finPos[-1] = $strE->[$i][2];
 946                 } elsif ($to_be_pushed) {
 947                     push @strWt, [ \@wt ];
 948                     push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
 949                     $finPos[-1] = NOMATCHPOS if $found_base;
 950                     push @finPos, $strE->[$i][2];
 951                     $found_base++;
 952                 }
 953                 # else ===> no-op
 954             }
 955             $i++;
 956         }
 957
 958         # try to match
 959         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
 960             if ($iniPos[0] != NOMATCHPOS &&
 961                     $finPos[$#subWt] != NOMATCHPOS &&
 962                         _eqArray(\@strWt, \@subWt, $lev)) {
 963                 my $temp = $iniPos[0] + $pos;
 964
 965                 if ($grob) {
 966                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
 967                     splice @strWt,  0, $#subWt;
 968                     splice @iniPos, 0, $#subWt;
 969                     splice @finPos, 0, $#subWt;
 970                 }
 971                 else {
 972                     return wantarray
 973                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
 974                         :  $temp;
 975                 }
 976             }
 977             shift @strWt;
 978             shift @iniPos;
 979             shift @finPos;
 980         }
 981     }
 982
 983     return $grob
 984         ? @g_ret
 985         : wantarray ? () : NOMATCHPOS;
 986 }
 987
 988 ##
 989 ## scalarref to matching part = match(string, substring)
 990 ##
 991 sub match
 992 {
 993     my $self = shift;
 994     if (my($pos,$len) = $self->index($_[0], $_[1])) {
 995         my $temp = substr($_[0], $pos, $len);
 996         return wantarray ? $temp : \$temp;
 997         # An lvalue ref \substr should be avoided,
 998         # since its value is affected by modification of its referent.
 999     }
1000     else {
1001         return;
1002     }
1003 }
1004
1005 ##
1006 ## arrayref matching parts = gmatch(string, substring)
1007 ##
1008 sub gmatch
1009 {
1010     my $self = shift;
1011     my $str  = shift;
1012     my $sub  = shift;
1013     return map substr($str, $_->[0], $_->[1]),
1014                 $self->index($str, $sub, 0, 'g');
1015 }
1016
1017 ##
1018 ## bool subst'ed = subst(string, substring, replace)
1019 ##
1020 sub subst
1021 {
1022     my $self = shift;
1023     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1024
1025     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1026         if ($code) {
1027             my $mat = substr($_[0], $pos, $len);
1028             substr($_[0], $pos, $len, $code->($mat));
1029         } else {
1030             substr($_[0], $pos, $len, $_[2]);
1031         }
1032         return TRUE;
1033     }
1034     else {
1035         return FALSE;
1036     }
1037 }
1038
1039 ##
1040 ## int count = gsubst(string, substring, replace)
1041 ##
1042 sub gsubst
1043 {
1044     my $self = shift;
1045     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1046     my $cnt = 0;
1047
1048     # Replacement is carried out from the end, then use reverse.
1049     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1050         if ($code) {
1051             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1052             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1053         } else {
1054             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1055         }
1056         $cnt++;
1057     }
1058     return $cnt;
1059 }
1060
1061 1;
1062 __END__
1063
1064 =head1 NAME
1065
1066 Unicode::Collate - Unicode Collation Algorithm
1067
1068 =head1 SYNOPSIS
1069
1070   use Unicode::Collate;
1071
1072   #construct
1073   $Collator = Unicode::Collate->new(%tailoring);
1074
1075   #sort
1076   @sorted = $Collator->sort(@not_sorted);
1077
1078   #compare
1079   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1080
1081   # If %tailoring is false (i.e. empty),
1082   # $Collator should do the default collation.
1083
1084 =head1 DESCRIPTION
1085
1086 This module is an implementation
1087 of Unicode Technical Standard #10 (UTS #10)
1088 "Unicode Collation Algorithm."
1089
1090 =head2 Constructor and Tailoring
1091
1092 The C<new> method returns a collator object.
1093
1094    $Collator = Unicode::Collate->new(
1095       UCA_Version => $UCA_Version,
1096       alternate => $alternate, # deprecated: use of 'variable' is recommended.
1097       backwards => $levelNumber, # or \@levelNumbers
1098       entry => $element,
1099       hangul_terminator => $term_primary_weight,
1100       ignoreName => qr/$ignoreName/,
1101       ignoreChar => qr/$ignoreChar/,
1102       katakana_before_hiragana => $bool,
1103       level => $collationLevel,
1104       normalization  => $normalization_form,
1105       overrideCJK => \&overrideCJK,
1106       overrideHangul => \&overrideHangul,
1107       preprocess => \&preprocess,
1108       rearrange => \@charList,
1109       table => $filename,
1110       undefName => qr/$undefName/,
1111       undefChar => qr/$undefChar/,
1112       upper_before_lower => $bool,
1113       variable => $variable,
1114    );
1115
1116 =over 4
1117
1118 =item UCA_Version
1119
1120 If the tracking version number of the older UCA is given,
1121 the older behavior of that tracking version is emulated on collating.
1122 If omitted, the return value of C<UCA_Version()> is used.
1123
1124 The supported tracking version: 8, 9, or 11.
1125
1126 B<This parameter may be removed in the future version,
1127 as switching the algorithm would affect the performance.>
1128
1129 =item backwards
1130
1131 -- see 3.1.2 French Accents, UTS #10.
1132
1133      backwards => $levelNumber or \@levelNumbers
1134
1135 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1136 If omitted, forwards at all the levels.
1137
1138 =item entry
1139
1140 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1141
1142 If the same character (or a sequence of characters) exists
1143 in the collation element table through C<table>,
1144 mapping to collation elements is overrided.
1145 If it does not exist, the mapping is defined additionally.
1146
1147     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1148 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1149 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1150 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1151 006C 006C ; [.0F4C.0020.0002.006C] # ll
1152 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1153 004C 004C ; [.0F4C.0020.0008.004C] # LL
1154 00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1155 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1156 00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1157 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1158 ENTRY
1159
1160     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1161 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1162 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1163 ENTRY
1164
1165 B<NOTE:> The code point in the UCA file format (before C<';'>)
1166 B<must> be a Unicode code point (defined as hexadecimal),
1167 but not a native code point.
1168 So C<0063> must always denote C<U+0063>,
1169 but not a character of C<"\x63">.
1170
1171 Weighting may vary depending on collation element table.
1172 So ensure the weights defined in C<entry> will be consistent with
1173 those in the collation element table loaded via C<table>.
1174
1175 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1176 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1177 (as a value between C<0E60> and C<0E6D>)
1178 makes ordering as C<C E<lt> CH E<lt> D>.
1179 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1180 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1181 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1182 and C<c-curl> (C<U+0255>) with C<0E69>.
1183 Then primary weight C<0E6A> for C<CH> makes C<CH>
1184 ordered between C<c-curl> and C<D>.
1185
1186 =item hangul_terminator
1187
1188 -- see Condition B.2. in 7.1.4 Trailing Weights, UTS #10.
1189
1190 If a true value is given (non-zero but should be positive),
1191 it will be added as a terminator primary weight to the end of
1192 every standard Hangul syllable. Secondary and any higher weights
1193 for terminator are set to zero.
1194 If the value is false or C<hangul_terminator> key does not exist,
1195 insertion of terminator weights will not be performed.
1196
1197 Boundaries of Hangul syllables are determined
1198 according to conjoining Jamo behavior in F<the Unicode Standard>
1199 and F<HangulSyllableType.txt>.
1200
1201 B<Implementation Note:>
1202 (1) For expansion mapping (Unicode character mapped
1203 to a sequence of collation elements), a terminator will not be added
1204 between collation elements, even if Hangul syllable boundary exists there.
1205 Addition of terminator is restricted to the next position
1206 to the last collation element.
1207
1208 (2) Non-conjoining Hangul letters
1209 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1210 automatically terminated with a terminator primary weight.
1211 These characters may need terminator included in a collation element
1212 table beforehand.
1213
1214 =item ignoreName
1215
1216 =item ignoreChar
1217
1218 -- see Completely Ignorable, 3.2.2 Variable Weighting, UTS #10.
1219
1220 Makes the entry in the table completely ignorable;
1221 i.e. as if the weights were zero at all level.
1222
1223 E.g. when 'a' and 'e' are ignorable,
1224 'element' is equal to 'lament' (or 'lmnt').
1225
1226 =item level
1227
1228 -- see 4.3 Form a sort key for each string, UTS #10.
1229
1230 Set the maximum level.
1231 Any higher levels than the specified one are ignored.
1232
1233   Level 1: alphabetic ordering
1234   Level 2: diacritic ordering
1235   Level 3: case ordering
1236   Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1237
1238   ex.level => 2,
1239
1240 If omitted, the maximum is the 4th.
1241
1242 =item normalization
1243
1244 -- see 4.1 Normalize each input string, UTS #10.
1245
1246 If specified, strings are normalized before preparation of sort keys
1247 (the normalization is executed after preprocess).
1248
1249 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1250 as C<$normalization_form>.
1251 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1252 See C<Unicode::Normalize::normalize()> for detail.
1253 If omitted, C<'NFD'> is used.
1254
1255 C<normalization> is performed after C<preprocess> (if defined).
1256
1257 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1258 though they are not concerned with C<Unicode::Normalize::normalize()>.
1259
1260 If C<undef> (not a string C<"undef">) is passed explicitly
1261 as the value for this key,
1262 any normalization is not carried out (this may make tailoring easier
1263 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1264 only contiguous contractions are resolved;
1265 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1266 C<A-cedilla-ring> would be primary equal to C<A>.
1267 In this point,
1268 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1269 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1270
1271 In the case of C<(normalization =E<gt> "prenormalized")>,
1272 any normalization is not performed, but
1273 non-contiguous contractions with combining characters are performed.
1274 Therefore
1275 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1276 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1277 If source strings are finely prenormalized,
1278 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1279
1280 Except C<(normalization =E<gt> undef)>,
1281 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1282
1283 =item overrideCJK
1284
1285 -- see 7.1 Derived Collation Elements, UTS #10.
1286
1287 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1288 (but C<CJK Unified Ideographs> [C<U+4E00> to C<U+9FA5>]  are lesser than
1289 C<CJK Unified Ideographs Extension> [C<U+3400> to C<U+4DB5> and
1290 C<U+20000> to C<U+2A6D6>].
1291
1292 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1293
1294 ex. CJK Unified Ideographs in the JIS code point order.
1295
1296   overrideCJK => sub {
1297       my $u = shift;             # get a Unicode codepoint
1298       my $b = pack('n', $u);     # to UTF-16BE
1299       my $s = your_unicode_to_sjis_converter($b); # convert
1300       my $n = unpack('n', $s);   # convert sjis to short
1301       [ $n, 0x20, 0x2, $u ];     # return the collation element
1302   },
1303
1304 ex. ignores all CJK Unified Ideographs.
1305
1306   overrideCJK => sub {()}, # CODEREF returning empty list
1307
1308    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1309    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1310
1311 If C<undef> is passed explicitly as the value for this key,
1312 weights for CJK Unified Ideographs are treated as undefined.
1313 But assignment of weight for CJK Unified Ideographs
1314 in table or C<entry> is still valid.
1315
1316 =item overrideHangul
1317
1318 -- see 7.1 Derived Collation Elements, UTS #10.
1319
1320 By default, Hangul Syllables are decomposed into Hangul Jamo,
1321 even if C<(normalization =E<gt> undef)>.
1322 But the mapping of Hangul Syllables may be overrided.
1323
1324 This tag works like C<overrideCJK>, so see there for examples.
1325
1326 If you want to override the mapping of Hangul Syllables,
1327 NFD, NFKD, and FCD are not appropriate,
1328 since they will decompose Hangul Syllables before overriding.
1329
1330 If C<undef> is passed explicitly as the value for this key,
1331 weight for Hangul Syllables is treated as undefined
1332 without decomposition into Hangul Jamo.
1333 But definition of weight for Hangul Syllables
1334 in table or C<entry> is still valid.
1335
1336 =item preprocess
1337
1338 -- see 5.1 Preprocessing, UTS #10.
1339
1340 If specified, the coderef is used to preprocess
1341 before the formation of sort keys.
1342
1343 ex. dropping English articles, such as "a" or "the".
1344 Then, "the pen" is before "a pencil".
1345
1346      preprocess => sub {
1347            my $str = shift;
1348            $str =~ s/\b(?:an?|the)\s+//gi;
1349            return $str;
1350         },
1351
1352 C<preprocess> is performed before C<normalization> (if defined).
1353
1354 =item rearrange
1355
1356 -- see 3.1.3 Rearrangement, UTS #10.
1357
1358 Characters that are not coded in logical order and to be rearranged.
1359 By default,
1360
1361     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1362
1363 If you want to disallow any rearrangement,
1364 pass C<undef> or C<[]> (a reference to an empty list)
1365 as the value for this key.
1366
1367 B<According to the version 9 of UCA, this parameter shall not be used;
1368 but it is not warned at present.>
1369
1370 =item table
1371
1372 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1373
1374 You can use another collation element table if desired.
1375
1376 The table file should locate in the F<Unicode/Collate> directory
1377 on C<@INC>. Say, if the filename is F<Foo.txt>
1378 the table file is searched as F<Unicode/Collate/Foo.txt> in <@INC>.
1379
1380 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1381
1382 If C<undef> is passed explicitly as the value for this key,
1383 no file is read (but you can define collation elements via C<entry>).
1384
1385 A typical way to define a collation element table
1386 without any file of table:
1387
1388    $onlyABC = Unicode::Collate->new(
1389        table => undef,
1390        entry => << 'ENTRIES',
1391 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1392 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1393 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1394 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1395 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1396 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1397 ENTRIES
1398     );
1399
1400 =item undefName
1401
1402 =item undefChar
1403
1404 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1405
1406 Undefines the collation element as if it were unassigned in the table.
1407 This reduces the size of the table.
1408 If an unassigned character appears in the string to be collated,
1409 the sort key is made from its codepoint
1410 as a single-character collation element,
1411 as it is greater than any other assigned collation elements
1412 (in the codepoint order among the unassigned characters).
1413 But, it'd be better to ignore characters
1414 unfamiliar to you and maybe never used.
1415
1416 ex. Collation weights for beyond-BMP characters are not stored in object:
1417
1418     undefChar => qr/[^\0-\x{fffd}]/,
1419
1420 =item katakana_before_hiragana
1421
1422 =item upper_before_lower
1423
1424 -- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTS #10.
1425
1426 By default, lowercase is before uppercase
1427 and hiragana is before katakana.
1428
1429 If the tag is made true, this is reversed.
1430
1431 B<NOTE>: These tags simplemindedly assume
1432 any lowercase/uppercase or hiragana/katakana distinctions
1433 must occur in level 3, and their weights at level 3
1434 must be same as those mentioned in 7.3.1, UTS #10.
1435 If you define your collation elements which violate this requirement,
1436 these tags don't work validly.
1437
1438 =item variable
1439
1440 =item alternate
1441
1442 -- see 3.2.2 Variable Weighting, UTS #10.
1443
1444 (the title in UCA version 8: Alternate Weighting)
1445
1446 This key allows to variable weighting for variable collation elements,
1447 which are marked with an ASTERISK in the table
1448 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1449
1450    variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1451
1452 These names are case-insensitive.
1453 By default (if specification is omitted), 'shifted' is adopted.
1454
1455    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1456                     considered at the 4th level.
1457
1458    'Non-Ignorable'  Variable elements are not reset to ignorable.
1459
1460    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1461                     their level 4 weight is replaced by the old level 1 weight.
1462                     Level 4 weight for Non-Variable elements is 0xFFFF.
1463
1464    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1465                     are trimmed.
1466
1467 For backward compatibility, C<alternate> can be used as an alias
1468 for C<variable>.
1469
1470 =back
1471
1472 =head2 Methods for Collation
1473
1474 =over 4
1475
1476 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1477
1478 Sorts a list of strings.
1479
1480 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1481
1482 Returns 1 (when C<$a> is greater than C<$b>)
1483 or 0 (when C<$a> is equal to C<$b>)
1484 or -1 (when C<$a> is lesser than C<$b>).
1485
1486 =item C<$result = $Collator-E<gt>eq($a, $b)>
1487
1488 =item C<$result = $Collator-E<gt>ne($a, $b)>
1489
1490 =item C<$result = $Collator-E<gt>lt($a, $b)>
1491
1492 =item C<$result = $Collator-E<gt>le($a, $b)>
1493
1494 =item C<$result = $Collator-E<gt>gt($a, $b)>
1495
1496 =item C<$result = $Collator-E<gt>ge($a, $b)>
1497
1498 They works like the same name operators as theirs.
1499
1500    eq : whether $a is equal to $b.
1501    ne : whether $a is not equal to $b.
1502    lt : whether $a is lesser than $b.
1503    le : whether $a is lesser than $b or equal to $b.
1504    gt : whether $a is greater than $b.
1505    ge : whether $a is greater than $b or equal to $b.
1506
1507 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1508
1509 -- see 4.3 Form a sort key for each string, UTS #10.
1510
1511 Returns a sort key.
1512
1513 You compare the sort keys using a binary comparison
1514 and get the result of the comparison of the strings using UCA.
1515
1516    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1517
1518       is equivalent to
1519
1520    $Collator->cmp($a, $b)
1521
1522 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1523
1524    use Unicode::Collate;
1525    my $c = Unicode::Collate->new();
1526    print $c->viewSortKey("Perl"),"\n";
1527
1528    # output:
1529    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1530    #  Level 1               Level 2               Level 3               Level 4
1531
1532     (If C<UCA_Version> is 8, the output is slightly different.)
1533
1534 =back
1535
1536 =head2 Methods for Searching
1537
1538 B<DISCLAIMER:> If C<preprocess> or C<normalization> tag is true
1539 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1540 C<subst>, C<gsubst>) is croaked,
1541 as the position and the length might differ
1542 from those on the specified string.
1543 (And C<rearrange> and C<hangul_terminator> tags are neglected.)
1544
1545 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1546 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1547 but they are not aware of any pattern, but only a literal substring.
1548
1549 =over 4
1550
1551 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1552
1553 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1554
1555 If C<$substring> matches a part of C<$string>, returns
1556 the position of the first occurrence of the matching part in scalar context;
1557 in list context, returns a two-element list of
1558 the position and the length of the matching part.
1559
1560 If C<$substring> does not match any part of C<$string>,
1561 returns C<-1> in scalar context and
1562 an empty list in list context.
1563
1564 e.g. you say
1565
1566   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1567                                      # (normalization => undef) is REQUIRED.
1568   my $str = "Ich muß studieren Perl.";
1569   my $sub = "MÜSS";
1570   my $match;
1571   if (my($pos,$len) = $Collator->index($str, $sub)) {
1572       $match = substr($str, $pos, $len);
1573   }
1574
1575 and get C<"muß"> in C<$match> since C<"muß">
1576 is primary equal to C<"MÜSS">.
1577
1578 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1579
1580 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1581
1582 If C<$substring> matches a part of C<$string>, in scalar context, returns
1583 B<a reference to> the first occurrence of the matching part
1584 (C<$match_ref> is always true if matches,
1585 since every reference is B<true>);
1586 in list context, returns the first occurrence of the matching part.
1587
1588 If C<$substring> does not match any part of C<$string>,
1589 returns C<undef> in scalar context and
1590 an empty list in list context.
1591
1592 e.g.
1593
1594     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1595         print "matches [$$match_ref].\n";
1596     } else {
1597         print "doesn't match.\n";
1598     }
1599
1600      or
1601
1602     if (($match) = $Collator->match($str, $sub)) { # list context
1603         print "matches [$match].\n";
1604     } else {
1605         print "doesn't match.\n";
1606     }
1607
1608 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1609
1610 If C<$substring> matches a part of C<$string>, returns
1611 all the matching parts (or matching count in scalar context).
1612
1613 If C<$substring> does not match any part of C<$string>,
1614 returns an empty list.
1615
1616 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1617
1618 If C<$substring> matches a part of C<$string>,
1619 the first occurrence of the matching part is replaced by C<$replacement>
1620 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1621
1622 C<$replacement> can be a C<CODEREF>,
1623 taking the matching part as an argument,
1624 and returning a string to replace the matching part
1625 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1626
1627 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1628
1629 If C<$substring> matches a part of C<$string>,
1630 all the occurrences of the matching part is replaced by C<$replacement>
1631 (C<$string> is modified) and return C<$count>.
1632
1633 C<$replacement> can be a C<CODEREF>,
1634 taking the matching part as an argument,
1635 and returning a string to replace the matching part
1636 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1637
1638 e.g.
1639
1640   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1641                                      # (normalization => undef) is REQUIRED.
1642   my $str = "Camel ass came\x{301}l CAMEL horse cAm\0E\0L...";
1643   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1644
1645   # now $str is "<b>Camel</b> ass <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1646   # i.e., all the camels are made bold-faced.
1647
1648 =back
1649
1650 =head2 Other Methods
1651
1652 =over 4
1653
1654 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1655
1656 Change the value of specified keys and returns the changed part.
1657
1658     $Collator = Unicode::Collate->new(level => 4);
1659
1660     $Collator->eq("perl", "PERL"); # false
1661
1662     %old = $Collator->change(level => 2); # returns (level => 4).
1663
1664     $Collator->eq("perl", "PERL"); # true
1665
1666     $Collator->change(%old); # returns (level => 2).
1667
1668     $Collator->eq("perl", "PERL"); # false
1669
1670 Not all C<(key,value)>s are allowed to be changed.
1671 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1672
1673 In the scalar context, returns the modified collator
1674 (but it is B<not> a clone from the original).
1675
1676     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1677
1678     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1679
1680     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1681
1682 =item C<$version = $Collator-E<gt>version()>
1683
1684 Returns the version number (a string) of the Unicode Standard
1685 which the C<table> file used by the collator object is based on.
1686 If the table does not include a version line (starting with C<@version>),
1687 returns C<"unknown">.
1688
1689 =item C<UCA_Version()>
1690
1691 Returns the tracking version number of UTS #10 this module consults.
1692
1693 =item C<Base_Unicode_Version()>
1694
1695 Returns the version number of UTS #10 this module consults.
1696
1697 =back
1698
1699 =head2 EXPORT
1700
1701 None by default.
1702
1703 =head2 CAVEAT
1704
1705 Use of the C<normalization> parameter requires
1706 the B<Unicode::Normalize> module.
1707
1708 If you need not it (say, in the case when you need not
1709 handle any combining characters),
1710 assign C<normalization =E<gt> undef> explicitly.
1711
1712 -- see 6.5 Avoiding Normalization, UTS #10.
1713
1714 =head2 Conformance Test
1715
1716 The Conformance Test for the UCA is available
1717 under L<http://www.unicode.org/Public/UCA/>.
1718
1719 For F<CollationTest_SHIFTED.txt>,
1720 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1721 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1722 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1723
1724 B<Unicode::Normalize is required to try The Conformance Test.>
1725
1726 =head1 AUTHOR
1727
1728 SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
1729
1730   http://homepage1.nifty.com/nomenclator/perl/
1731
1732   Copyright(C) 2001-2004, SADAHIRO Tomoyuki. Japan. All rights reserved.
1733
1734   This library is free software; you can redistribute it
1735   and/or modify it under the same terms as Perl itself.
1736
1737 =head1 SEE ALSO
1738
1739 =over 4
1740
1741 =item Unicode Collation Algorithm - UTS #10
1742
1743 L<http://www.unicode.org/reports/tr10/>
1744
1745 =item The Default Unicode Collation Element Table (DUCET)
1746
1747 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1748
1749 =item The conformance test for the UCA
1750
1751 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1752
1753 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1754
1755 =item Hangul Syllable Type
1756
1757 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1758
1759 =item Unicode Normalization Forms - UAX #15
1760
1761 L<http://www.unicode.org/reports/tr15/>
1762
1763 =item L<Unicode::Normalize>
1764
1765 =back
1766
1767 =cut