lib/Unicode/Collate.pm

   1 package Unicode::Collate;
   2
   3 BEGIN {
   4     unless ("A" eq pack('U', 0x41)) {
   5         die "Unicode::Collate cannot stringify a Unicode code point\n";
   6     }
   7 }
   8
   9 use 5.006;
  10 use strict;
  11 use warnings;
  12 use Carp;
  13 use File::Spec;
  14
  15 no warnings 'utf8';
  16
  17 our $VERSION = '0.52';
  18 our $PACKAGE = __PACKAGE__;
  19
  20 my @Path = qw(Unicode Collate);
  21 my $KeyFile = "allkeys.txt";
  22
  23 # Perl's boolean
  24 use constant TRUE  => 1;
  25 use constant FALSE => "";
  26 use constant NOMATCHPOS => -1;
  27
  28 # A coderef to get combining class imported from Unicode::Normalize
  29 # (i.e. \&Unicode::Normalize::getCombinClass).
  30 # This is also used as a HAS_UNICODE_NORMALIZE flag.
  31 my $CVgetCombinClass;
  32
  33 # Supported Levels
  34 use constant MinLevel => 1;
  35 use constant MaxLevel => 4;
  36
  37 # Minimum weights at level 2 and 3, respectively
  38 use constant Min2Wt => 0x20;
  39 use constant Min3Wt => 0x02;
  40
  41 # Shifted weight at 4th level
  42 use constant Shift4Wt => 0xFFFF;
  43
  44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element
  45 # PROBLEM: The Default Unicode Collation Element Table
  46 # has weights over 0xFFFF at the 4th level.
  47 # The tie-breaking in the variable weights
  48 # other than "shift" (as well as "shift-trimmed") is unreliable.
  49 use constant VCE_TEMPLATE => 'Cn4';
  50
  51 # A sort key: 16-bit weights
  52 # See also the PROBLEM on VCE_TEMPLATE above.
  53 use constant KEY_TEMPLATE => 'n*';
  54
  55 # Level separator in a sort key:
  56 # i.e. pack(KEY_TEMPLATE, 0)
  57 use constant LEVEL_SEP => "\0\0";
  58
  59 # As Unicode code point separator for hash keys.
  60 # A joined code point string (denoted by JCPS below)
  61 # like "65;768" is used for internal processing
  62 # instead of Perl's Unicode string like "\x41\x{300}",
  63 # as the native code point is different from the Unicode code point
  64 # on EBCDIC platform.
  65 # This character must not be included in any stringified
  66 # representation of an integer.
  67 use constant CODE_SEP => ';';
  68
  69 # boolean values of variable weights
  70 use constant NON_VAR => 0; # Non-Variable character
  71 use constant VAR     => 1; # Variable character
  72
  73 # specific code points
  74 use constant Hangul_LBase  => 0x1100;
  75 use constant Hangul_LIni   => 0x1100;
  76 use constant Hangul_LFin   => 0x1159;
  77 use constant Hangul_LFill  => 0x115F;
  78 use constant Hangul_VBase  => 0x1161;
  79 use constant Hangul_VIni   => 0x1160; # from Vowel Filler
  80 use constant Hangul_VFin   => 0x11A2;
  81 use constant Hangul_TBase  => 0x11A7; # from "no-final" codepoint
  82 use constant Hangul_TIni   => 0x11A8;
  83 use constant Hangul_TFin   => 0x11F9;
  84 use constant Hangul_TCount => 28;
  85 use constant Hangul_NCount => 588;
  86 use constant Hangul_SBase  => 0xAC00;
  87 use constant Hangul_SIni   => 0xAC00;
  88 use constant Hangul_SFin   => 0xD7A3;
  89 use constant CJK_UidIni    => 0x4E00;
  90 use constant CJK_UidFin    => 0x9FA5;
  91 use constant CJK_UidF41    => 0x9FBB;
  92 use constant CJK_ExtAIni   => 0x3400;
  93 use constant CJK_ExtAFin   => 0x4DB5;
  94 use constant CJK_ExtBIni   => 0x20000;
  95 use constant CJK_ExtBFin   => 0x2A6D6;
  96 use constant BMP_Max       => 0xFFFF;
  97
  98 # Logical_Order_Exception in PropList.txt
  99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
 100
 101 sub UCA_Version { "14" }
 102
 103 sub Base_Unicode_Version { "4.1.0" }
 104
 105 ######
 106
 107 sub pack_U {
 108     return pack('U*', @_);
 109 }
 110
 111 sub unpack_U {
 112     return unpack('U*', shift(@_).pack('U*'));
 113 }
 114
 115 ######
 116
 117 my (%VariableOK);
 118 @VariableOK{ qw/
 119     blanked  non-ignorable  shifted  shift-trimmed
 120   / } = (); # keys lowercased
 121
 122 our @ChangeOK = qw/
 123     alternate backwards level normalization rearrange
 124     katakana_before_hiragana upper_before_lower
 125     overrideHangul overrideCJK preprocess UCA_Version
 126     hangul_terminator variable
 127   /;
 128
 129 our @ChangeNG = qw/
 130     entry mapping table maxlength
 131     ignoreChar ignoreName undefChar undefName variableTable
 132     versionTable alternateTable backwardsTable forwardsTable rearrangeTable
 133     derivCode normCode rearrangeHash
 134     backwardsFlag
 135   /;
 136 # The hash key 'ignored' is deleted at v 0.21.
 137 # The hash key 'isShift' is deleted at v 0.23.
 138 # The hash key 'combining' is deleted at v 0.24.
 139 # The hash key 'entries' is deleted at v 0.30.
 140 # The hash key 'L3_ignorable' is deleted at v 0.40.
 141
 142 sub version {
 143     my $self = shift;
 144     return $self->{versionTable} || 'unknown';
 145 }
 146
 147 my (%ChangeOK, %ChangeNG);
 148 @ChangeOK{ @ChangeOK } = ();
 149 @ChangeNG{ @ChangeNG } = ();
 150
 151 sub change {
 152     my $self = shift;
 153     my %hash = @_;
 154     my %old;
 155     if (exists $hash{variable} && exists $hash{alternate}) {
 156         delete $hash{alternate};
 157     }
 158     elsif (!exists $hash{variable} && exists $hash{alternate}) {
 159         $hash{variable} = $hash{alternate};
 160     }
 161     foreach my $k (keys %hash) {
 162         if (exists $ChangeOK{$k}) {
 163             $old{$k} = $self->{$k};
 164             $self->{$k} = $hash{$k};
 165         }
 166         elsif (exists $ChangeNG{$k}) {
 167             croak "change of $k via change() is not allowed!";
 168         }
 169         # else => ignored
 170     }
 171     $self->checkCollator();
 172     return wantarray ? %old : $self;
 173 }
 174
 175 sub _checkLevel {
 176     my $level = shift;
 177     my $key   = shift; # 'level' or 'backwards'
 178     MinLevel <= $level or croak sprintf
 179         "Illegal level %d (in value for key '%s') lower than %d.",
 180             $level, $key, MinLevel;
 181     $level <= MaxLevel or croak sprintf
 182         "Unsupported level %d (in value for key '%s') higher than %d.",
 183             $level, $key, MaxLevel;
 184 }
 185
 186 my %DerivCode = (
 187     8 => \&_derivCE_8,
 188     9 => \&_derivCE_9,
 189    11 => \&_derivCE_9, # 11 == 9
 190    14 => \&_derivCE_14,
 191 );
 192
 193 sub checkCollator {
 194     my $self = shift;
 195     _checkLevel($self->{level}, "level");
 196
 197     $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
 198         or croak "Illegal UCA version (passed $self->{UCA_Version}).";
 199
 200     $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
 201                                 $self->{alternateTable} || 'shifted';
 202     $self->{variable} = $self->{alternate} = lc($self->{variable});
 203     exists $VariableOK{ $self->{variable} }
 204         or croak "$PACKAGE unknown variable parameter name: $self->{variable}";
 205
 206     if (! defined $self->{backwards}) {
 207         $self->{backwardsFlag} = 0;
 208     }
 209     elsif (! ref $self->{backwards}) {
 210         _checkLevel($self->{backwards}, "backwards");
 211         $self->{backwardsFlag} = 1 << $self->{backwards};
 212     }
 213     else {
 214         my %level;
 215         $self->{backwardsFlag} = 0;
 216         for my $b (@{ $self->{backwards} }) {
 217             _checkLevel($b, "backwards");
 218             $level{$b} = 1;
 219         }
 220         for my $v (sort keys %level) {
 221             $self->{backwardsFlag} += 1 << $v;
 222         }
 223     }
 224
 225     defined $self->{rearrange} or $self->{rearrange} = [];
 226     ref $self->{rearrange}
 227         or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";
 228
 229     # keys of $self->{rearrangeHash} are $self->{rearrange}.
 230     $self->{rearrangeHash} = undef;
 231
 232     if (@{ $self->{rearrange} }) {
 233         @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
 234     }
 235
 236     $self->{normCode} = undef;
 237
 238     if (defined $self->{normalization}) {
 239         eval { require Unicode::Normalize };
 240         $@ and croak "Unicode::Normalize is required to normalize strings";
 241
 242         $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;
 243
 244         if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
 245             $self->{normCode} = \&Unicode::Normalize::NFD;
 246         }
 247         elsif ($self->{normalization} ne 'prenormalized') {
 248             my $norm = $self->{normalization};
 249             $self->{normCode} = sub {
 250                 Unicode::Normalize::normalize($norm, shift);
 251             };
 252             eval { $self->{normCode}->("") }; # try
 253             $@ and croak "$PACKAGE unknown normalization form name: $norm";
 254         }
 255     }
 256     return;
 257 }
 258
 259 sub new
 260 {
 261     my $class = shift;
 262     my $self = bless { @_ }, $class;
 263
 264     # If undef is passed explicitly, no file is read.
 265     $self->{table} = $KeyFile if ! exists $self->{table};
 266     $self->read_table() if defined $self->{table};
 267
 268     if ($self->{entry}) {
 269         while ($self->{entry} =~ /([^\n]+)/g) {
 270             $self->parseEntry($1);
 271         }
 272     }
 273
 274     $self->{level} ||= MaxLevel;
 275     $self->{UCA_Version} ||= UCA_Version();
 276
 277     $self->{overrideHangul} = FALSE
 278         if ! exists $self->{overrideHangul};
 279     $self->{overrideCJK} = FALSE
 280         if ! exists $self->{overrideCJK};
 281     $self->{normalization} = 'NFD'
 282         if ! exists $self->{normalization};
 283     $self->{rearrange} = $self->{rearrangeTable} ||
 284         ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
 285         if ! exists $self->{rearrange};
 286     $self->{backwards} = $self->{backwardsTable}
 287         if ! exists $self->{backwards};
 288
 289     $self->checkCollator();
 290
 291     return $self;
 292 }
 293
 294 sub read_table {
 295     my $self = shift;
 296
 297     my($f, $fh);
 298     foreach my $d (@INC) {
 299         $f = File::Spec->catfile($d, @Path, $self->{table});
 300         last if open($fh, $f);
 301         $f = undef;
 302     }
 303     if (!defined $f) {
 304         $f = File::Spec->catfile(@Path, $self->{table});
 305         croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
 306     }
 307
 308     while (my $line = <$fh>) {
 309         next if $line =~ /^\s*#/;
 310         unless ($line =~ s/^\s*\@//) {
 311             $self->parseEntry($line);
 312             next;
 313         }
 314
 315         # matched ^\s*\@
 316         if ($line =~ /^version\s*(\S*)/) {
 317             $self->{versionTable} ||= $1;
 318         }
 319         elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
 320             $self->{variableTable} ||= $1;
 321         }
 322         elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
 323             $self->{alternateTable} ||= $1;
 324         }
 325         elsif ($line =~ /^backwards\s+(\S*)/) {
 326             push @{ $self->{backwardsTable} }, $1;
 327         }
 328         elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use
 329             push @{ $self->{forwardsTable} }, $1;
 330         }
 331         elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
 332             push @{ $self->{rearrangeTable} }, _getHexArray($1);
 333         }
 334     }
 335     close $fh;
 336 }
 337
 338
 339 ##
 340 ## get $line, parse it, and write an entry in $self
 341 ##
 342 sub parseEntry
 343 {
 344     my $self = shift;
 345     my $line = shift;
 346     my($name, $entry, @uv, @key);
 347
 348     return if $line !~ /^\s*[0-9A-Fa-f]/;
 349
 350     # removes comment and gets name
 351     $name = $1
 352         if $line =~ s/[#%]\s*(.*)//;
 353     return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
 354
 355     # gets element
 356     my($e, $k) = split /;/, $line;
 357     croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
 358         if ! $k;
 359
 360     @uv = _getHexArray($e);
 361     return if !@uv;
 362
 363     $entry = join(CODE_SEP, @uv); # in JCPS
 364
 365     if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
 366         my $ele = pack_U(@uv);
 367
 368         # regarded as if it were not entried in the table
 369         return
 370             if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
 371
 372         # replaced as completely ignorable
 373         $k = '[.0000.0000.0000.0000]'
 374             if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
 375     }
 376
 377     # replaced as completely ignorable
 378     $k = '[.0000.0000.0000.0000]'
 379         if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;
 380
 381     my $is_L3_ignorable = TRUE;
 382
 383     foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
 384         my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
 385         my @wt = _getHexArray($arr);
 386         push @key, pack(VCE_TEMPLATE, $var, @wt);
 387         $is_L3_ignorable = FALSE
 388             if $wt[0] || $wt[1] || $wt[2];
 389         # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
 390         # is completely ignorable.
 391         # For expansion, an entry $is_L3_ignorable
 392         # if and only if "all" CEs are [.0000.0000.0000].
 393     }
 394
 395     $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;
 396
 397     if (@uv > 1) {
 398         (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv)
 399             and $self->{maxlength}{$uv[0]} = @uv;
 400     }
 401 }
 402
 403
 404 ##
 405 ## VCE = _varCE(variable term, VCE)
 406 ##
 407 sub _varCE
 408 {
 409     my $vbl = shift;
 410     my $vce = shift;
 411     if ($vbl eq 'non-ignorable') {
 412         return $vce;
 413     }
 414     my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
 415
 416     if ($var) {
 417         return pack(VCE_TEMPLATE, $var, 0, 0, 0,
 418                 $vbl eq 'blanked' ? $wt[3] : $wt[0]);
 419     }
 420     elsif ($vbl eq 'blanked') {
 421         return $vce;
 422     }
 423     else {
 424         return pack(VCE_TEMPLATE, $var, @wt[0..2],
 425             $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
 426     }
 427 }
 428
 429 sub viewSortKey
 430 {
 431     my $self = shift;
 432     $self->visualizeSortKey($self->getSortKey(@_));
 433 }
 434
 435 sub visualizeSortKey
 436 {
 437     my $self = shift;
 438     my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
 439
 440     if ($self->{UCA_Version} <= 8) {
 441         $view =~ s/ ?0000 ?/|/g;
 442     } else {
 443         $view =~ s/\b0000\b/|/g;
 444     }
 445     return "[$view]";
 446 }
 447
 448
 449 ##
 450 ## arrayref of JCPS   = splitEnt(string to be collated)
 451 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true)
 452 ##
 453 sub splitEnt
 454 {
 455     my $self = shift;
 456     my $wLen = $_[1];
 457
 458     my $code = $self->{preprocess};
 459     my $norm = $self->{normCode};
 460     my $map  = $self->{mapping};
 461     my $max  = $self->{maxlength};
 462     my $reH  = $self->{rearrangeHash};
 463     my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11;
 464
 465     my ($str, @buf);
 466
 467     if ($wLen) {
 468         $code and croak "Preprocess breaks character positions. "
 469                         . "Don't use with index(), match(), etc.";
 470         $norm and croak "Normalization breaks character positions. "
 471                         . "Don't use with index(), match(), etc.";
 472         $str = $_[0];
 473     }
 474     else {
 475         $str = $_[0];
 476         $str = &$code($str) if ref $code;
 477         $str = &$norm($str) if ref $norm;
 478     }
 479
 480     # get array of Unicode code point of string.
 481     my @src = unpack_U($str);
 482
 483     # rearrangement:
 484     # Character positions are not kept if rearranged,
 485     # then neglected if $wLen is true.
 486     if ($reH && ! $wLen) {
 487         for (my $i = 0; $i < @src; $i++) {
 488             if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
 489                 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
 490                 $i++;
 491             }
 492         }
 493     }
 494
 495     # remove a code point marked as a completely ignorable.
 496     for (my $i = 0; $i < @src; $i++) {
 497         $src[$i] = undef
 498             if _isIllegal($src[$i]) || ($ver9 &&
 499                 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0);
 500     }
 501
 502     for (my $i = 0; $i < @src; $i++) {
 503         my $jcps = $src[$i];
 504
 505         # skip removed code point
 506         if (! defined $jcps) {
 507             if ($wLen && @buf) {
 508                 $buf[-1][2] = $i + 1;
 509             }
 510             next;
 511         }
 512
 513         my $i_orig = $i;
 514
 515         # find contraction
 516         if ($max->{$jcps}) {
 517             my $temp_jcps = $jcps;
 518             my $jcpsLen = 1;
 519             my $maxLen = $max->{$jcps};
 520
 521             for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
 522                 next if ! defined $src[$p];
 523                 $temp_jcps .= CODE_SEP . $src[$p];
 524                 $jcpsLen++;
 525                 if ($map->{$temp_jcps}) {
 526                     $jcps = $temp_jcps;
 527                     $i = $p;
 528                 }
 529             }
 530
 531         # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1).
 532         # This process requires Unicode::Normalize.
 533         # If "normalization" is undef, here should be skipped *always*
 534         # (in spite of bool value of $CVgetCombinClass),
 535         # since canonical ordering cannot be expected.
 536         # Blocked combining character should not be contracted.
 537
 538             if ($self->{normalization})
 539             # $self->{normCode} is false in the case of "prenormalized".
 540             {
 541                 my $preCC = 0;
 542                 my $curCC = 0;
 543
 544                 for (my $p = $i + 1; $p < @src; $p++) {
 545                     next if ! defined $src[$p];
 546                     $curCC = $CVgetCombinClass->($src[$p]);
 547                     last unless $curCC;
 548                     my $tail = CODE_SEP . $src[$p];
 549                     if ($preCC != $curCC && $map->{$jcps.$tail}) {
 550                         $jcps .= $tail;
 551                         $src[$p] = undef;
 552                     } else {
 553                         $preCC = $curCC;
 554                     }
 555                 }
 556             }
 557         }
 558
 559         # skip completely ignorable
 560         if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
 561             if ($wLen && @buf) {
 562                 $buf[-1][2] = $i + 1;
 563             }
 564             next;
 565         }
 566
 567         push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
 568     }
 569     return \@buf;
 570 }
 571
 572
 573 ##
 574 ## list of VCE = getWt(JCPS)
 575 ##
 576 sub getWt
 577 {
 578     my $self = shift;
 579     my $u    = shift;
 580     my $vbl  = $self->{variable};
 581     my $map  = $self->{mapping};
 582     my $der  = $self->{derivCode};
 583
 584     return if !defined $u;
 585     return map(_varCE($vbl, $_), @{ $map->{$u} })
 586         if $map->{$u};
 587
 588     # JCPS must not be a contraction, then it's a code point.
 589     if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
 590         my $hang = $self->{overrideHangul};
 591         my @hangulCE;
 592         if ($hang) {
 593             @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u));
 594         }
 595         elsif (!defined $hang) {
 596             @hangulCE = $der->($u);
 597         }
 598         else {
 599             my $max  = $self->{maxlength};
 600             my @decH = _decompHangul($u);
 601
 602             if (@decH == 2) {
 603                 my $contract = join(CODE_SEP, @decH);
 604                 @decH = ($contract) if $map->{$contract};
 605             } else { # must be <@decH == 3>
 606                 if ($max->{$decH[0]}) {
 607                     my $contract = join(CODE_SEP, @decH);
 608                     if ($map->{$contract}) {
 609                         @decH = ($contract);
 610                     } else {
 611                         $contract = join(CODE_SEP, @decH[0,1]);
 612                         $map->{$contract} and @decH = ($contract, $decH[2]);
 613                     }
 614                     # even if V's ignorable, LT contraction is not supported.
 615                     # If such a situatution were required, NFD should be used.
 616                 }
 617                 if (@decH == 3 && $max->{$decH[1]}) {
 618                     my $contract = join(CODE_SEP, @decH[1,2]);
 619                     $map->{$contract} and @decH = ($decH[0], $contract);
 620                 }
 621             }
 622
 623             @hangulCE = map({
 624                     $map->{$_} ? @{ $map->{$_} } : $der->($_);
 625                 } @decH);
 626         }
 627         return map _varCE($vbl, $_), @hangulCE;
 628     }
 629     elsif (_isUIdeo($u, $self->{UCA_Version})) {
 630         my $cjk  = $self->{overrideCJK};
 631         return map _varCE($vbl, $_),
 632             $cjk
 633                 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u))
 634                 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000
 635                     ? _uideoCE_8($u)
 636                     : $der->($u);
 637     }
 638     else {
 639         return map _varCE($vbl, $_), $der->($u);
 640     }
 641 }
 642
 643
 644 ##
 645 ## string sortkey = getSortKey(string arg)
 646 ##
 647 sub getSortKey
 648 {
 649     my $self = shift;
 650     my $lev  = $self->{level};
 651     my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
 652     my $v2i  = $self->{UCA_Version} >= 9 &&
 653                 $self->{variable} ne 'non-ignorable';
 654
 655     my @buf; # weight arrays
 656     if ($self->{hangul_terminator}) {
 657         my $preHST = '';
 658         foreach my $jcps (@$rEnt) {
 659             # weird things like VL, TL-contraction are not considered!
 660             my $curHST = '';
 661             foreach my $u (split /;/, $jcps) {
 662                 $curHST .= getHST($u);
 663             }
 664             if ($preHST && !$curHST || # hangul before non-hangul
 665                 $preHST =~ /L\z/ && $curHST =~ /^T/ ||
 666                 $preHST =~ /V\z/ && $curHST =~ /^L/ ||
 667                 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
 668
 669                 push @buf, $self->getWtHangulTerm();
 670             }
 671             $preHST = $curHST;
 672
 673             push @buf, $self->getWt($jcps);
 674         }
 675         $preHST # end at hangul
 676             and push @buf, $self->getWtHangulTerm();
 677     }
 678     else {
 679         foreach my $jcps (@$rEnt) {
 680             push @buf, $self->getWt($jcps);
 681         }
 682     }
 683
 684     # make sort key
 685     my @ret = ([],[],[],[]);
 686     my $last_is_variable;
 687
 688     foreach my $vwt (@buf) {
 689         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 690
 691         # "Ignorable (L1, L2) after Variable" since track. v. 9
 692         if ($v2i) {
 693             if ($var) {
 694                 $last_is_variable = TRUE;
 695             }
 696             elsif (!$wt[0]) { # ignorable
 697                 next if $last_is_variable;
 698             }
 699             else {
 700                 $last_is_variable = FALSE;
 701             }
 702         }
 703         foreach my $v (0..$lev-1) {
 704             0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
 705         }
 706     }
 707
 708     # modification of tertiary weights
 709     if ($self->{upper_before_lower}) {
 710         foreach my $w (@{ $ret[2] }) {
 711             if    (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
 712             elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
 713             elsif ($w == 0x1C)             { $w += 1 } # square upper
 714             elsif ($w == 0x1D)             { $w -= 1 } # square lower
 715         }
 716     }
 717     if ($self->{katakana_before_hiragana}) {
 718         foreach my $w (@{ $ret[2] }) {
 719             if    (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
 720             elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
 721         }
 722     }
 723
 724     if ($self->{backwardsFlag}) {
 725         for (my $v = MinLevel; $v <= MaxLevel; $v++) {
 726             if ($self->{backwardsFlag} & (1 << $v)) {
 727                 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
 728             }
 729         }
 730     }
 731
 732     join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
 733 }
 734
 735
 736 ##
 737 ## int compare = cmp(string a, string b)
 738 ##
 739 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
 740 sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
 741 sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
 742 sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
 743 sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
 744 sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
 745 sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
 746
 747 ##
 748 ## list[strings] sorted = sort(list[strings] arg)
 749 ##
 750 sub sort {
 751     my $obj = shift;
 752     return
 753         map { $_->[1] }
 754             sort{ $a->[0] cmp $b->[0] }
 755                 map [ $obj->getSortKey($_), $_ ], @_;
 756 }
 757
 758
 759 sub _derivCE_14 {
 760     my $u = shift;
 761     my $base =
 762         (CJK_UidIni  <= $u && $u <= CJK_UidF41)
 763             ? 0xFB40 : # CJK
 764         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 765          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 766             ? 0xFB80   # CJK ext.
 767             : 0xFBC0;  # others
 768
 769     my $aaaa = $base + ($u >> 15);
 770     my $bbbb = ($u & 0x7FFF) | 0x8000;
 771     return
 772         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 773         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 774 }
 775
 776 sub _derivCE_9 {
 777     my $u = shift;
 778     my $base =
 779         (CJK_UidIni  <= $u && $u <= CJK_UidFin)
 780             ? 0xFB40 : # CJK
 781         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
 782          CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 783             ? 0xFB80   # CJK ext.
 784             : 0xFBC0;  # others
 785
 786     my $aaaa = $base + ($u >> 15);
 787     my $bbbb = ($u & 0x7FFF) | 0x8000;
 788     return
 789         pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
 790         pack(VCE_TEMPLATE, NON_VAR, $bbbb,      0,      0, $u);
 791 }
 792
 793 sub _derivCE_8 {
 794     my $code = shift;
 795     my $aaaa =  0xFF80 + ($code >> 15);
 796     my $bbbb = ($code & 0x7FFF) | 0x8000;
 797     return
 798         pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
 799         pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
 800 }
 801
 802 sub _uideoCE_8 {
 803     my $u = shift;
 804     return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
 805 }
 806
 807 sub _isUIdeo {
 808     my ($u, $uca_vers) = @_;
 809     return(
 810         (CJK_UidIni <= $u &&
 811             ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin)))
 812                 ||
 813         (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
 814                 ||
 815         (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
 816     );
 817 }
 818
 819
 820 sub getWtHangulTerm {
 821     my $self = shift;
 822     return _varCE($self->{variable},
 823         pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0));
 824 }
 825
 826
 827 ##
 828 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
 829 ##
 830 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
 831
 832 #
 833 # $code *must* be in Hangul syllable.
 834 # Check it before you enter here.
 835 #
 836 sub _decompHangul {
 837     my $code = shift;
 838     my $si = $code - Hangul_SBase;
 839     my $li = int( $si / Hangul_NCount);
 840     my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
 841     my $ti =      $si % Hangul_TCount;
 842     return (
 843         Hangul_LBase + $li,
 844         Hangul_VBase + $vi,
 845         $ti ? (Hangul_TBase + $ti) : (),
 846     );
 847 }
 848
 849 sub _isIllegal {
 850     my $code = shift;
 851     return ! defined $code                      # removed
 852         || ($code < 0 || 0x10FFFF < $code)      # out of range
 853         || (($code & 0xFFFE) == 0xFFFE)         # ??FFF[EF] (cf. utf8.c)
 854         || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
 855         || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
 856     ;
 857 }
 858
 859 # Hangul Syllable Type
 860 sub getHST {
 861     my $u = shift;
 862     return
 863         Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" :
 864         Hangul_VIni <= $u && $u <= Hangul_VFin       ? "V" :
 865         Hangul_TIni <= $u && $u <= Hangul_TFin       ? "T" :
 866         Hangul_SIni <= $u && $u <= Hangul_SFin ?
 867             ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : "";
 868 }
 869
 870
 871 ##
 872 ## bool _nonIgnorAtLevel(arrayref weights, int level)
 873 ##
 874 sub _nonIgnorAtLevel($$)
 875 {
 876     my $wt = shift;
 877     return if ! defined $wt;
 878     my $lv = shift;
 879     return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
 880 }
 881
 882 ##
 883 ## bool _eqArray(
 884 ##    arrayref of arrayref[weights] source,
 885 ##    arrayref of arrayref[weights] substr,
 886 ##    int level)
 887 ## * comparison of graphemes vs graphemes.
 888 ##   @$source >= @$substr must be true (check it before call this);
 889 ##
 890 sub _eqArray($$$)
 891 {
 892     my $source = shift;
 893     my $substr = shift;
 894     my $lev = shift;
 895
 896     for my $g (0..@$substr-1){
 897         # Do the $g'th graphemes have the same number of AV weigths?
 898         return if @{ $source->[$g] } != @{ $substr->[$g] };
 899
 900         for my $w (0..@{ $substr->[$g] }-1) {
 901             for my $v (0..$lev-1) {
 902                 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
 903             }
 904         }
 905     }
 906     return 1;
 907 }
 908
 909 ##
 910 ## (int position, int length)
 911 ## int position = index(string, substring, position, [undoc'ed grobal])
 912 ##
 913 ## With "grobal" (only for the list context),
 914 ##  returns list of arrayref[position, length].
 915 ##
 916 sub index
 917 {
 918     my $self = shift;
 919     my $str  = shift;
 920     my $len  = length($str);
 921     my $subE = $self->splitEnt(shift);
 922     my $pos  = @_ ? shift : 0;
 923        $pos  = 0 if $pos < 0;
 924     my $grob = shift;
 925
 926     my $lev  = $self->{level};
 927     my $v2i  = $self->{UCA_Version} >= 9 &&
 928                 $self->{variable} ne 'non-ignorable';
 929
 930     if (! @$subE) {
 931         my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
 932         return $grob
 933             ? map([$_, 0], $temp..$len)
 934             : wantarray ? ($temp,0) : $temp;
 935     }
 936     $len < $pos
 937         and return wantarray ? () : NOMATCHPOS;
 938     my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
 939     @$strE
 940         or return wantarray ? () : NOMATCHPOS;
 941
 942     my(@strWt, @iniPos, @finPos, @subWt, @g_ret);
 943
 944     my $last_is_variable;
 945     for my $vwt (map $self->getWt($_), @$subE) {
 946         my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 947         my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 948
 949         # "Ignorable (L1, L2) after Variable" since track. v. 9
 950         if ($v2i) {
 951             if ($var) {
 952                 $last_is_variable = TRUE;
 953             }
 954             elsif (!$wt[0]) { # ignorable
 955                 $to_be_pushed = FALSE if $last_is_variable;
 956             }
 957             else {
 958                 $last_is_variable = FALSE;
 959             }
 960         }
 961
 962         if (@subWt && !$var && !$wt[0]) {
 963             push @{ $subWt[-1] }, \@wt if $to_be_pushed;
 964         } else {
 965             push @subWt, [ \@wt ];
 966         }
 967     }
 968
 969     my $count = 0;
 970     my $end = @$strE - 1;
 971
 972     $last_is_variable = FALSE; # reuse
 973     for (my $i = 0; $i <= $end; ) { # no $i++
 974         my $found_base = 0;
 975
 976         # fetch a grapheme
 977         while ($i <= $end && $found_base == 0) {
 978             for my $vwt ($self->getWt($strE->[$i][0])) {
 979                 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
 980                 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);
 981
 982                 # "Ignorable (L1, L2) after Variable" since track. v. 9
 983                 if ($v2i) {
 984                     if ($var) {
 985                         $last_is_variable = TRUE;
 986                     }
 987                     elsif (!$wt[0]) { # ignorable
 988                         $to_be_pushed = FALSE if $last_is_variable;
 989                     }
 990                     else {
 991                         $last_is_variable = FALSE;
 992                     }
 993                 }
 994
 995                 if (@strWt && !$var && !$wt[0]) {
 996                     push @{ $strWt[-1] }, \@wt if $to_be_pushed;
 997                     $finPos[-1] = $strE->[$i][2];
 998                 } elsif ($to_be_pushed) {
 999                     push @strWt, [ \@wt ];
1000                     push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
1001                     $finPos[-1] = NOMATCHPOS if $found_base;
1002                     push @finPos, $strE->[$i][2];
1003                     $found_base++;
1004                 }
1005                 # else ===> no-op
1006             }
1007             $i++;
1008         }
1009
1010         # try to match
1011         while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
1012             if ($iniPos[0] != NOMATCHPOS &&
1013                     $finPos[$#subWt] != NOMATCHPOS &&
1014                         _eqArray(\@strWt, \@subWt, $lev)) {
1015                 my $temp = $iniPos[0] + $pos;
1016
1017                 if ($grob) {
1018                     push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
1019                     splice @strWt,  0, $#subWt;
1020                     splice @iniPos, 0, $#subWt;
1021                     splice @finPos, 0, $#subWt;
1022                 }
1023                 else {
1024                     return wantarray
1025                         ? ($temp, $finPos[$#subWt] - $iniPos[0])
1026                         :  $temp;
1027                 }
1028             }
1029             shift @strWt;
1030             shift @iniPos;
1031             shift @finPos;
1032         }
1033     }
1034
1035     return $grob
1036         ? @g_ret
1037         : wantarray ? () : NOMATCHPOS;
1038 }
1039
1040 ##
1041 ## scalarref to matching part = match(string, substring)
1042 ##
1043 sub match
1044 {
1045     my $self = shift;
1046     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1047         my $temp = substr($_[0], $pos, $len);
1048         return wantarray ? $temp : \$temp;
1049         # An lvalue ref \substr should be avoided,
1050         # since its value is affected by modification of its referent.
1051     }
1052     else {
1053         return;
1054     }
1055 }
1056
1057 ##
1058 ## arrayref matching parts = gmatch(string, substring)
1059 ##
1060 sub gmatch
1061 {
1062     my $self = shift;
1063     my $str  = shift;
1064     my $sub  = shift;
1065     return map substr($str, $_->[0], $_->[1]),
1066                 $self->index($str, $sub, 0, 'g');
1067 }
1068
1069 ##
1070 ## bool subst'ed = subst(string, substring, replace)
1071 ##
1072 sub subst
1073 {
1074     my $self = shift;
1075     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1076
1077     if (my($pos,$len) = $self->index($_[0], $_[1])) {
1078         if ($code) {
1079             my $mat = substr($_[0], $pos, $len);
1080             substr($_[0], $pos, $len, $code->($mat));
1081         } else {
1082             substr($_[0], $pos, $len, $_[2]);
1083         }
1084         return TRUE;
1085     }
1086     else {
1087         return FALSE;
1088     }
1089 }
1090
1091 ##
1092 ## int count = gsubst(string, substring, replace)
1093 ##
1094 sub gsubst
1095 {
1096     my $self = shift;
1097     my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
1098     my $cnt = 0;
1099
1100     # Replacement is carried out from the end, then use reverse.
1101     for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
1102         if ($code) {
1103             my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
1104             substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
1105         } else {
1106             substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
1107         }
1108         $cnt++;
1109     }
1110     return $cnt;
1111 }
1112
1113 1;
1114 __END__
1115
1116 =head1 NAME
1117
1118 Unicode::Collate - Unicode Collation Algorithm
1119
1120 =head1 SYNOPSIS
1121
1122   use Unicode::Collate;
1123
1124   #construct
1125   $Collator = Unicode::Collate->new(%tailoring);
1126
1127   #sort
1128   @sorted = $Collator->sort(@not_sorted);
1129
1130   #compare
1131   $result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
1132
1133   # If %tailoring is false (i.e. empty),
1134   # $Collator should do the default collation.
1135
1136 =head1 DESCRIPTION
1137
1138 This module is an implementation of Unicode Technical Standard #10
1139 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
1140
1141 =head2 Constructor and Tailoring
1142
1143 The C<new> method returns a collator object.
1144
1145    $Collator = Unicode::Collate->new(
1146       UCA_Version => $UCA_Version,
1147       alternate => $alternate, # deprecated: use of 'variable' is recommended.
1148       backwards => $levelNumber, # or \@levelNumbers
1149       entry => $element,
1150       hangul_terminator => $term_primary_weight,
1151       ignoreName => qr/$ignoreName/,
1152       ignoreChar => qr/$ignoreChar/,
1153       katakana_before_hiragana => $bool,
1154       level => $collationLevel,
1155       normalization  => $normalization_form,
1156       overrideCJK => \&overrideCJK,
1157       overrideHangul => \&overrideHangul,
1158       preprocess => \&preprocess,
1159       rearrange => \@charList,
1160       table => $filename,
1161       undefName => qr/$undefName/,
1162       undefChar => qr/$undefChar/,
1163       upper_before_lower => $bool,
1164       variable => $variable,
1165    );
1166
1167 =over 4
1168
1169 =item UCA_Version
1170
1171 If the tracking version number of UCA is given,
1172 behavior of that tracking version is emulated on collating.
1173 If omitted, the return value of C<UCA_Version()> is used.
1174 C<UCA_Version()> should return the latest tracking version supported.
1175
1176 The supported tracking version: 8, 9, 11, or 14.
1177
1178      UCA       Unicode Standard         DUCET (@version)
1179      ---------------------------------------------------
1180       8              3.1                3.0.1 (3.0.1d9)
1181       9     3.1 with Corrigendum 3      3.1.1 (3.1.1)
1182      11              4.0                4.0.0 (4.0.0)
1183      14             4.1.0               4.1.0 (4.1.0)
1184
1185 Note: Recent UTS #10 renames "Tracking Version" to "Revision."
1186
1187 =item alternate
1188
1189 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10
1190
1191 For backward compatibility, C<alternate> (old name) can be used
1192 as an alias for C<variable>.
1193
1194 =item backwards
1195
1196 -- see 3.1.2 French Accents, UTS #10.
1197
1198      backwards => $levelNumber or \@levelNumbers
1199
1200 Weights in reverse order; ex. level 2 (diacritic ordering) in French.
1201 If omitted, forwards at all the levels.
1202
1203 =item entry
1204
1205 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10.
1206
1207 If the same character (or a sequence of characters) exists
1208 in the collation element table through C<table>,
1209 mapping to collation elements is overrided.
1210 If it does not exist, the mapping is defined additionally.
1211
1212     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1213 0063 0068 ; [.0E6A.0020.0002.0063] # ch
1214 0043 0068 ; [.0E6A.0020.0007.0043] # Ch
1215 0043 0048 ; [.0E6A.0020.0008.0043] # CH
1216 006C 006C ; [.0F4C.0020.0002.006C] # ll
1217 004C 006C ; [.0F4C.0020.0007.004C] # Ll
1218 004C 004C ; [.0F4C.0020.0008.004C] # LL
1219 00F1      ; [.0F7B.0020.0002.00F1] # n-tilde
1220 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
1221 00D1      ; [.0F7B.0020.0008.00D1] # N-tilde
1222 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
1223 ENTRY
1224
1225     entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
1226 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
1227 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
1228 ENTRY
1229
1230 B<NOTE:> The code point in the UCA file format (before C<';'>)
1231 B<must> be a Unicode code point (defined as hexadecimal),
1232 but not a native code point.
1233 So C<0063> must always denote C<U+0063>,
1234 but not a character of C<"\x63">.
1235
1236 Weighting may vary depending on collation element table.
1237 So ensure the weights defined in C<entry> will be consistent with
1238 those in the collation element table loaded via C<table>.
1239
1240 In DUCET v4.0.0, primary weight of C<C> is C<0E60>
1241 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
1242 (as a value between C<0E60> and C<0E6D>)
1243 makes ordering as C<C E<lt> CH E<lt> D>.
1244 Exactly speaking DUCET already has some characters between C<C> and C<D>:
1245 C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
1246 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
1247 and C<c-curl> (C<U+0255>) with C<0E69>.
1248 Then primary weight C<0E6A> for C<CH> makes C<CH>
1249 ordered between C<c-curl> and C<D>.
1250
1251 =item hangul_terminator
1252
1253 -- see 7.1.4 Trailing Weights, UTS #10.
1254
1255 If a true value is given (non-zero but should be positive),
1256 it will be added as a terminator primary weight to the end of
1257 every standard Hangul syllable. Secondary and any higher weights
1258 for terminator are set to zero.
1259 If the value is false or C<hangul_terminator> key does not exist,
1260 insertion of terminator weights will not be performed.
1261
1262 Boundaries of Hangul syllables are determined
1263 according to conjoining Jamo behavior in F<the Unicode Standard>
1264 and F<HangulSyllableType.txt>.
1265
1266 B<Implementation Note:>
1267 (1) For expansion mapping (Unicode character mapped
1268 to a sequence of collation elements), a terminator will not be added
1269 between collation elements, even if Hangul syllable boundary exists there.
1270 Addition of terminator is restricted to the next position
1271 to the last collation element.
1272
1273 (2) Non-conjoining Hangul letters
1274 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
1275 automatically terminated with a terminator primary weight.
1276 These characters may need terminator included in a collation element
1277 table beforehand.
1278
1279 =item ignoreChar
1280
1281 =item ignoreName
1282
1283 -- see 3.2.2 Variable Weighting, UTS #10.
1284
1285 Makes the entry in the table completely ignorable;
1286 i.e. as if the weights were zero at all level.
1287
1288 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
1289 will be ignored. Through C<ignoreName>, any character whose name
1290 (given in the C<table> file as a comment) matches C<qr/$ignoreName/>
1291 will be ignored.
1292
1293 E.g. when 'a' and 'e' are ignorable,
1294 'element' is equal to 'lament' (or 'lmnt').
1295
1296 =item katakana_before_hiragana
1297
1298 -- see 7.3.1 Tertiary Weight Table, UTS #10.
1299
1300 By default, hiragana is before katakana.
1301 If the parameter is made true, this is reversed.
1302
1303 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
1304 distinctions must occur in level 3, and their weights at level 3 must be
1305 same as those mentioned in 7.3.1, UTS #10.
1306 If you define your collation elements which violate this requirement,
1307 this parameter does not work validly.
1308
1309 =item level
1310
1311 -- see 4.3 Form Sort Key, UTS #10.
1312
1313 Set the maximum level.
1314 Any higher levels than the specified one are ignored.
1315
1316   Level 1: alphabetic ordering
1317   Level 2: diacritic ordering
1318   Level 3: case ordering
1319   Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
1320
1321   ex.level => 2,
1322
1323 If omitted, the maximum is the 4th.
1324
1325 =item normalization
1326
1327 -- see 4.1 Normalize, UTS #10.
1328
1329 If specified, strings are normalized before preparation of sort keys
1330 (the normalization is executed after preprocess).
1331
1332 A form name C<Unicode::Normalize::normalize()> accepts will be applied
1333 as C<$normalization_form>.
1334 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
1335 See C<Unicode::Normalize::normalize()> for detail.
1336 If omitted, C<'NFD'> is used.
1337
1338 C<normalization> is performed after C<preprocess> (if defined).
1339
1340 Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
1341 though they are not concerned with C<Unicode::Normalize::normalize()>.
1342
1343 If C<undef> (not a string C<"undef">) is passed explicitly
1344 as the value for this key,
1345 any normalization is not carried out (this may make tailoring easier
1346 if any normalization is not desired). Under C<(normalization =E<gt> undef)>,
1347 only contiguous contractions are resolved;
1348 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
1349 C<A-cedilla-ring> would be primary equal to C<A>.
1350 In this point,
1351 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
1352 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
1353
1354 In the case of C<(normalization =E<gt> "prenormalized")>,
1355 any normalization is not performed, but
1356 non-contiguous contractions with combining characters are performed.
1357 Therefore
1358 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
1359 B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
1360 If source strings are finely prenormalized,
1361 C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
1362
1363 Except C<(normalization =E<gt> undef)>,
1364 B<Unicode::Normalize> is required (see also B<CAVEAT>).
1365
1366 =item overrideCJK
1367
1368 -- see 7.1 Derived Collation Elements, UTS #10.
1369
1370 By default, CJK Unified Ideographs are ordered in Unicode codepoint order
1371 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is
1372 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>)
1373 are lesser than C<CJK Unified Ideographs Extension> (its range is
1374 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>).
1375
1376 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided.
1377
1378 ex. CJK Unified Ideographs in the JIS code point order.
1379
1380   overrideCJK => sub {
1381       my $u = shift;             # get a Unicode codepoint
1382       my $b = pack('n', $u);     # to UTF-16BE
1383       my $s = your_unicode_to_sjis_converter($b); # convert
1384       my $n = unpack('n', $s);   # convert sjis to short
1385       [ $n, 0x20, 0x2, $u ];     # return the collation element
1386   },
1387
1388 ex. ignores all CJK Unified Ideographs.
1389
1390   overrideCJK => sub {()}, # CODEREF returning empty list
1391
1392    # where ->eq("Pe\x{4E00}rl", "Perl") is true
1393    # as U+4E00 is a CJK Unified Ideograph and to be ignorable.
1394
1395 If C<undef> is passed explicitly as the value for this key,
1396 weights for CJK Unified Ideographs are treated as undefined.
1397 But assignment of weight for CJK Unified Ideographs
1398 in table or C<entry> is still valid.
1399
1400 =item overrideHangul
1401
1402 -- see 7.1 Derived Collation Elements, UTS #10.
1403
1404 By default, Hangul Syllables are decomposed into Hangul Jamo,
1405 even if C<(normalization =E<gt> undef)>.
1406 But the mapping of Hangul Syllables may be overrided.
1407
1408 This parameter works like C<overrideCJK>, so see there for examples.
1409
1410 If you want to override the mapping of Hangul Syllables,
1411 NFD, NFKD, and FCD are not appropriate,
1412 since they will decompose Hangul Syllables before overriding.
1413
1414 If C<undef> is passed explicitly as the value for this key,
1415 weight for Hangul Syllables is treated as undefined
1416 without decomposition into Hangul Jamo.
1417 But definition of weight for Hangul Syllables
1418 in table or C<entry> is still valid.
1419
1420 =item preprocess
1421
1422 -- see 5.1 Preprocessing, UTS #10.
1423
1424 If specified, the coderef is used to preprocess
1425 before the formation of sort keys.
1426
1427 ex. dropping English articles, such as "a" or "the".
1428 Then, "the pen" is before "a pencil".
1429
1430      preprocess => sub {
1431            my $str = shift;
1432            $str =~ s/\b(?:an?|the)\s+//gi;
1433            return $str;
1434         },
1435
1436 C<preprocess> is performed before C<normalization> (if defined).
1437
1438 =item rearrange
1439
1440 -- see 3.1.3 Rearrangement, UTS #10.
1441
1442 Characters that are not coded in logical order and to be rearranged.
1443 If C<UCA_Version> is equal to or lesser than 11, default is:
1444
1445     rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
1446
1447 If you want to disallow any rearrangement, pass C<undef> or C<[]>
1448 (a reference to empty list) as the value for this key.
1449
1450 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement).
1451
1452 B<According to the version 9 of UCA, this parameter shall not be used;
1453 but it is not warned at present.>
1454
1455 =item table
1456
1457 -- see 3.2 Default Unicode Collation Element Table, UTS #10.
1458
1459 You can use another collation element table if desired.
1460
1461 The table file should locate in the F<Unicode/Collate> directory
1462 on C<@INC>. Say, if the filename is F<Foo.txt>,
1463 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
1464
1465 By default, F<allkeys.txt> (as the filename of DUCET) is used.
1466 If you will prepare your own table file, any name other than F<allkeys.txt>
1467 may be better to avoid namespace conflict.
1468
1469 If C<undef> is passed explicitly as the value for this key,
1470 no file is read (but you can define collation elements via C<entry>).
1471
1472 A typical way to define a collation element table
1473 without any file of table:
1474
1475    $onlyABC = Unicode::Collate->new(
1476        table => undef,
1477        entry => << 'ENTRIES',
1478 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
1479 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
1480 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
1481 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
1482 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
1483 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
1484 ENTRIES
1485     );
1486
1487 If C<ignoreName> or C<undefName> is used, character names should be
1488 specified as a comment (following C<#>) on each line.
1489
1490 =item undefChar
1491
1492 =item undefName
1493
1494 -- see 6.3.4 Reducing the Repertoire, UTS #10.
1495
1496 Undefines the collation element as if it were unassigned in the table.
1497 This reduces the size of the table.
1498 If an unassigned character appears in the string to be collated,
1499 the sort key is made from its codepoint
1500 as a single-character collation element,
1501 as it is greater than any other assigned collation elements
1502 (in the codepoint order among the unassigned characters).
1503 But, it'd be better to ignore characters
1504 unfamiliar to you and maybe never used.
1505
1506 Through C<undefChar>, any character matching C<qr/$undefChar/>
1507 will be undefined. Through C<undefName>, any character whose name
1508 (given in the C<table> file as a comment) matches C<qr/$undefName/>
1509 will be undefined.
1510
1511 ex. Collation weights for beyond-BMP characters are not stored in object:
1512
1513     undefChar => qr/[^\0-\x{fffd}]/,
1514
1515 =item upper_before_lower
1516
1517 -- see 6.6 Case Comparisons, UTS #10.
1518
1519 By default, lowercase is before uppercase.
1520 If the parameter is made true, this is reversed.
1521
1522 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
1523 distinctions must occur in level 3, and their weights at level 3 must be
1524 same as those mentioned in 7.3.1, UTS #10.
1525 If you define your collation elements which differs from this requirement,
1526 this parameter doesn't work validly.
1527
1528 =item variable
1529
1530 -- see 3.2.2 Variable Weighting, UTS #10.
1531
1532 This key allows to variable weighting for variable collation elements,
1533 which are marked with an ASTERISK in the table
1534 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
1535
1536    variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
1537
1538 These names are case-insensitive.
1539 By default (if specification is omitted), 'shifted' is adopted.
1540
1541    'Blanked'        Variable elements are made ignorable at levels 1 through 3;
1542                     considered at the 4th level.
1543
1544    'Non-Ignorable'  Variable elements are not reset to ignorable.
1545
1546    'Shifted'        Variable elements are made ignorable at levels 1 through 3
1547                     their level 4 weight is replaced by the old level 1 weight.
1548                     Level 4 weight for Non-Variable elements is 0xFFFF.
1549
1550    'Shift-Trimmed'  Same as 'shifted', but all FFFF's at the 4th level
1551                     are trimmed.
1552
1553 =back
1554
1555 =head2 Methods for Collation
1556
1557 =over 4
1558
1559 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
1560
1561 Sorts a list of strings.
1562
1563 =item C<$result = $Collator-E<gt>cmp($a, $b)>
1564
1565 Returns 1 (when C<$a> is greater than C<$b>)
1566 or 0 (when C<$a> is equal to C<$b>)
1567 or -1 (when C<$a> is lesser than C<$b>).
1568
1569 =item C<$result = $Collator-E<gt>eq($a, $b)>
1570
1571 =item C<$result = $Collator-E<gt>ne($a, $b)>
1572
1573 =item C<$result = $Collator-E<gt>lt($a, $b)>
1574
1575 =item C<$result = $Collator-E<gt>le($a, $b)>
1576
1577 =item C<$result = $Collator-E<gt>gt($a, $b)>
1578
1579 =item C<$result = $Collator-E<gt>ge($a, $b)>
1580
1581 They works like the same name operators as theirs.
1582
1583    eq : whether $a is equal to $b.
1584    ne : whether $a is not equal to $b.
1585    lt : whether $a is lesser than $b.
1586    le : whether $a is lesser than $b or equal to $b.
1587    gt : whether $a is greater than $b.
1588    ge : whether $a is greater than $b or equal to $b.
1589
1590 =item C<$sortKey = $Collator-E<gt>getSortKey($string)>
1591
1592 -- see 4.3 Form Sort Key, UTS #10.
1593
1594 Returns a sort key.
1595
1596 You compare the sort keys using a binary comparison
1597 and get the result of the comparison of the strings using UCA.
1598
1599    $Collator->getSortKey($a) cmp $Collator->getSortKey($b)
1600
1601       is equivalent to
1602
1603    $Collator->cmp($a, $b)
1604
1605 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
1606
1607 Converts a sorting key into its representation form.
1608 If C<UCA_Version> is 8, the output is slightly different.
1609
1610    use Unicode::Collate;
1611    my $c = Unicode::Collate->new();
1612    print $c->viewSortKey("Perl"),"\n";
1613
1614    # output:
1615    # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
1616    #  Level 1               Level 2               Level 3               Level 4
1617
1618 =back
1619
1620 =head2 Methods for Searching
1621
1622 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
1623 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
1624 C<subst>, C<gsubst>) is croaked,
1625 as the position and the length might differ
1626 from those on the specified string.
1627 (And C<rearrange> and C<hangul_terminator> parameters are neglected.)
1628
1629 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
1630 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
1631 but they are not aware of any pattern, but only a literal substring.
1632
1633 =over 4
1634
1635 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
1636
1637 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
1638
1639 If C<$substring> matches a part of C<$string>, returns
1640 the position of the first occurrence of the matching part in scalar context;
1641 in list context, returns a two-element list of
1642 the position and the length of the matching part.
1643
1644 If C<$substring> does not match any part of C<$string>,
1645 returns C<-1> in scalar context and
1646 an empty list in list context.
1647
1648 e.g. you say
1649
1650   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1651                                      # (normalization => undef) is REQUIRED.
1652   my $str = "Ich muß studieren Perl.";
1653   my $sub = "MÜSS";
1654   my $match;
1655   if (my($pos,$len) = $Collator->index($str, $sub)) {
1656       $match = substr($str, $pos, $len);
1657   }
1658
1659 and get C<"muß"> in C<$match> since C<"muß">
1660 is primary equal to C<"MÜSS">.
1661
1662 =item C<$match_ref = $Collator-E<gt>match($string, $substring)>
1663
1664 =item C<($match)   = $Collator-E<gt>match($string, $substring)>
1665
1666 If C<$substring> matches a part of C<$string>, in scalar context, returns
1667 B<a reference to> the first occurrence of the matching part
1668 (C<$match_ref> is always true if matches,
1669 since every reference is B<true>);
1670 in list context, returns the first occurrence of the matching part.
1671
1672 If C<$substring> does not match any part of C<$string>,
1673 returns C<undef> in scalar context and
1674 an empty list in list context.
1675
1676 e.g.
1677
1678     if ($match_ref = $Collator->match($str, $sub)) { # scalar context
1679         print "matches [$$match_ref].\n";
1680     } else {
1681         print "doesn't match.\n";
1682     }
1683
1684      or
1685
1686     if (($match) = $Collator->match($str, $sub)) { # list context
1687         print "matches [$match].\n";
1688     } else {
1689         print "doesn't match.\n";
1690     }
1691
1692 =item C<@match = $Collator-E<gt>gmatch($string, $substring)>
1693
1694 If C<$substring> matches a part of C<$string>, returns
1695 all the matching parts (or matching count in scalar context).
1696
1697 If C<$substring> does not match any part of C<$string>,
1698 returns an empty list.
1699
1700 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
1701
1702 If C<$substring> matches a part of C<$string>,
1703 the first occurrence of the matching part is replaced by C<$replacement>
1704 (C<$string> is modified) and return C<$count> (always equals to C<1>).
1705
1706 C<$replacement> can be a C<CODEREF>,
1707 taking the matching part as an argument,
1708 and returning a string to replace the matching part
1709 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
1710
1711 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
1712
1713 If C<$substring> matches a part of C<$string>,
1714 all the occurrences of the matching part is replaced by C<$replacement>
1715 (C<$string> is modified) and return C<$count>.
1716
1717 C<$replacement> can be a C<CODEREF>,
1718 taking the matching part as an argument,
1719 and returning a string to replace the matching part
1720 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
1721
1722 e.g.
1723
1724   my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
1725                                      # (normalization => undef) is REQUIRED.
1726   my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
1727   $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
1728
1729   # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
1730   # i.e., all the camels are made bold-faced.
1731
1732 =back
1733
1734 =head2 Other Methods
1735
1736 =over 4
1737
1738 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
1739
1740 Change the value of specified keys and returns the changed part.
1741
1742     $Collator = Unicode::Collate->new(level => 4);
1743
1744     $Collator->eq("perl", "PERL"); # false
1745
1746     %old = $Collator->change(level => 2); # returns (level => 4).
1747
1748     $Collator->eq("perl", "PERL"); # true
1749
1750     $Collator->change(%old); # returns (level => 2).
1751
1752     $Collator->eq("perl", "PERL"); # false
1753
1754 Not all C<(key,value)>s are allowed to be changed.
1755 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
1756
1757 In the scalar context, returns the modified collator
1758 (but it is B<not> a clone from the original).
1759
1760     $Collator->change(level => 2)->eq("perl", "PERL"); # true
1761
1762     $Collator->eq("perl", "PERL"); # true; now max level is 2nd.
1763
1764     $Collator->change(level => 4)->eq("perl", "PERL"); # false
1765
1766 =item C<$version = $Collator-E<gt>version()>
1767
1768 Returns the version number (a string) of the Unicode Standard
1769 which the C<table> file used by the collator object is based on.
1770 If the table does not include a version line (starting with C<@version>),
1771 returns C<"unknown">.
1772
1773 =item C<UCA_Version()>
1774
1775 Returns the tracking version number of UTS #10 this module consults.
1776
1777 =item C<Base_Unicode_Version()>
1778
1779 Returns the version number of UTS #10 this module consults.
1780
1781 =back
1782
1783 =head1 EXPORT
1784
1785 No method will be exported.
1786
1787 =head1 INSTALL
1788
1789 Though this module can be used without any C<table> file,
1790 to use this module easily, it is recommended to install a table file
1791 in the UCA format, by copying it under the directory
1792 <a place in @INC>/Unicode/Collate.
1793
1794 The most preferable one is "The Default Unicode Collation Element Table"
1795 (aka DUCET), available from the Unicode Consortium's website:
1796
1797    http://www.unicode.org/Public/UCA/
1798
1799    http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
1800
1801 If DUCET is not installed, it is recommended to copy the file
1802 from http://www.unicode.org/Public/UCA/latest/allkeys.txt
1803 to <a place in @INC>/Unicode/Collate/allkeys.txt
1804 manually.
1805
1806 =head1 CAVEATS
1807
1808 =over 4
1809
1810 =item Normalization
1811
1812 Use of the C<normalization> parameter requires the B<Unicode::Normalize>
1813 module (see L<Unicode::Normalize>).
1814
1815 If you need not it (say, in the case when you need not
1816 handle any combining characters),
1817 assign C<normalization =E<gt> undef> explicitly.
1818
1819 -- see 6.5 Avoiding Normalization, UTS #10.
1820
1821 =item Conformance Test
1822
1823 The Conformance Test for the UCA is available
1824 under L<http://www.unicode.org/Public/UCA/>.
1825
1826 For F<CollationTest_SHIFTED.txt>,
1827 a collator via C<Unicode::Collate-E<gt>new( )> should be used;
1828 for F<CollationTest_NON_IGNORABLE.txt>, a collator via
1829 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
1830
1831 B<Unicode::Normalize is required to try The Conformance Test.>
1832
1833 =back
1834
1835 =head1 AUTHOR, COPYRIGHT AND LICENSE
1836
1837 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
1838 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005,
1839 SADAHIRO Tomoyuki. Japan. All rights reserved.
1840
1841 This module is free software; you can redistribute it and/or
1842 modify it under the same terms as Perl itself.
1843
1844 The file Unicode/Collate/allkeys.txt was copied directly
1845 from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>.
1846 This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved.
1847 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
1848
1849 =head1 SEE ALSO
1850
1851 =over 4
1852
1853 =item Unicode Collation Algorithm - UTS #10
1854
1855 L<http://www.unicode.org/reports/tr10/>
1856
1857 =item The Default Unicode Collation Element Table (DUCET)
1858
1859 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
1860
1861 =item The conformance test for the UCA
1862
1863 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
1864
1865 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
1866
1867 =item Hangul Syllable Type
1868
1869 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
1870
1871 =item Unicode Normalization Forms - UAX #15
1872
1873 L<http://www.unicode.org/reports/tr15/>
1874
1875 =back
1876
1877 =cut