no warnings 'utf8';
-our $VERSION = '0.67';
+our $VERSION = '0.91';
our $PACKAGE = __PACKAGE__;
+### begin XS only ###
+require DynaLoader;
+our @ISA = qw(DynaLoader);
+bootstrap Unicode::Collate $VERSION;
+### end XS only ###
+
my @Path = qw(Unicode Collate);
my $KeyFile = "allkeys.txt";
# This character must not be included in any stringified
# representation of an integer.
use constant CODE_SEP => ';';
+ # NOTE: in regex /;/ is used for $jcps!
# boolean values of variable weights
use constant NON_VAR => 0; # Non-Variable character
use constant VAR => 1; # Variable character
# specific code points
-use constant Hangul_SBase => 0xAC00;
use constant Hangul_SIni => 0xAC00;
use constant Hangul_SFin => 0xD7A3;
-use constant Hangul_NCount => 588;
-use constant Hangul_TCount => 28;
-use constant Hangul_LBase => 0x1100;
-use constant Hangul_LIni => 0x1100;
-use constant Hangul_LFin => 0x1159;
-use constant Hangul_LFill => 0x115F;
-use constant Hangul_LEnd => 0x115F; # Unicode 5.2
-use constant Hangul_VBase => 0x1161;
-use constant Hangul_VIni => 0x1160; # from Vowel Filler
-use constant Hangul_VFin => 0x11A2;
-use constant Hangul_VEnd => 0x11A7; # Unicode 5.2
-use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint
-use constant Hangul_TIni => 0x11A8;
-use constant Hangul_TFin => 0x11F9;
-use constant Hangul_TEnd => 0x11FF; # Unicode 5.2
-use constant HangulL2Ini => 0xA960; # Unicode 5.2
-use constant HangulL2Fin => 0xA97C; # Unicode 5.2
-use constant HangulV2Ini => 0xD7B0; # Unicode 5.2
-use constant HangulV2Fin => 0xD7C6; # Unicode 5.2
-use constant HangulT2Ini => 0xD7CB; # Unicode 5.2
-use constant HangulT2Fin => 0xD7FB; # Unicode 5.2
-
-use constant CJK_UidIni => 0x4E00;
-use constant CJK_UidFin => 0x9FA5;
-use constant CJK_UidF41 => 0x9FBB;
-use constant CJK_UidF51 => 0x9FC3;
-use constant CJK_UidF52 => 0x9FCB;
-use constant CJK_ExtAIni => 0x3400; # Unicode 3.0
-use constant CJK_ExtAFin => 0x4DB5; # Unicode 3.0
-use constant CJK_ExtBIni => 0x20000; # Unicode 3.1
-use constant CJK_ExtBFin => 0x2A6D6; # Unicode 3.1
-use constant CJK_ExtCIni => 0x2A700; # Unicode 5.2
-use constant CJK_ExtCFin => 0x2B734; # Unicode 5.2
-use constant CJK_ExtDIni => 0x2B740; # Unicode 6.0
-use constant CJK_ExtDFin => 0x2B81D; # Unicode 6.0
-
-my %CompatUI = map +($_ => 1), (
- 0xFA0E, 0xFA0F, 0xFA11, 0xFA13, 0xFA14, 0xFA1F,
- 0xFA21, 0xFA23, 0xFA24, 0xFA27, 0xFA28, 0xFA29,
-);
# Logical_Order_Exception in PropList.txt
my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];
-sub UCA_Version { "20" }
+sub UCA_Version { "24" }
-sub Base_Unicode_Version { "5.2.0" }
+sub Base_Unicode_Version { "6.1.0" }
######
return pack('U*', @_);
}
-sub unpack_U {
- return unpack('U*', shift(@_).pack('U*'));
-}
-
######
my (%VariableOK);
our @ChangeOK = qw/
alternate backwards level normalization rearrange
- katakana_before_hiragana upper_before_lower
+ katakana_before_hiragana upper_before_lower ignore_level2
overrideHangul overrideCJK preprocess UCA_Version
hangul_terminator variable
/;
our @ChangeNG = qw/
entry mapping table maxlength contraction
- ignoreChar ignoreName undefChar undefName variableTable
- versionTable alternateTable backwardsTable forwardsTable rearrangeTable
+ ignoreChar ignoreName undefChar undefName rewrite
+ versionTable alternateTable backwardsTable forwardsTable
+ rearrangeTable variableTable
derivCode normCode rearrangeHash backwardsFlag
suppress suppressHash
- /;
+ __useXS /; ### XS only
# The hash key 'ignored' is deleted at v 0.21.
# The hash key 'isShift' is deleted at v 0.23.
# The hash key 'combining' is deleted at v 0.24.
18 => \&_derivCE_18,
20 => \&_derivCE_20,
22 => \&_derivCE_22,
+ 24 => \&_derivCE_24,
);
sub checkCollator {
my $class = shift;
my $self = bless { @_ }, $class;
+### begin XS only ###
+ if (! exists $self->{table} && !defined $self->{rewrite} &&
+ !defined $self->{undefName} && !defined $self->{ignoreName} &&
+ !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
+ $self->{__useXS} = \&_fetch_simple;
+ } else {
+ $self->{__useXS} = undef;
+ }
+### end XS only ###
+
# keys of $self->{suppressHash} are $self->{suppress}.
if ($self->{suppress} && @{ $self->{suppress} }) {
@{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
if ($self->{entry}) {
while ($self->{entry} =~ /([^\n]+)/g) {
- $self->parseEntry($1);
+ $self->parseEntry($1, TRUE);
}
}
sub read_table {
my $self = shift;
+### begin XS only ###
+ if ($self->{__useXS}) {
+ my @rest = _fetch_rest(); # complex matter need to parse
+ for my $line (@rest) {
+ next if $line =~ /^\s*#/;
+
+ if ($line =~ s/^\s*\@//) {
+ $self->parseAtmark($line);
+ } else {
+ $self->parseEntry($line);
+ }
+ }
+ return;
+ }
+### end XS only ###
+
my($f, $fh);
foreach my $d (@INC) {
$f = File::Spec->catfile($d, @Path, $self->{table});
{
my $self = shift;
my $line = shift;
+ my $tailoring = shift;
my($name, $entry, @uv, @key);
+ if (defined $self->{rewrite}) {
+ $line = $self->{rewrite}->($line);
+ }
+
return if $line !~ /^\s*[0-9A-Fa-f]/;
# removes comment and gets name
@uv = _getHexArray($e);
return if !@uv;
- return if @uv > 1 && $self->{suppressHash} &&
+ return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
exists $self->{suppressHash}{$uv[0]};
$entry = join(CODE_SEP, @uv); # in JCPS
}
-##
-## VCE = _varCE(variable, VCE)
-##
-sub _varCE
-{
- my $vbl = shift;
- my $vce = shift;
- if ($vbl eq 'non-ignorable') {
- return $vce;
- }
- my ($var, @wt) = unpack VCE_TEMPLATE, $vce;
-
- if ($var) {
- return pack(VCE_TEMPLATE, $var, 0, 0, 0,
- $vbl eq 'blanked' ? $wt[3] : $wt[0]);
- }
- elsif ($vbl eq 'blanked') {
- return $vce;
- }
- else {
- return pack(VCE_TEMPLATE, $var, @wt[0..2],
- $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0);
- }
-}
-
sub viewSortKey
{
my $self = shift;
$self->visualizeSortKey($self->getSortKey(@_));
}
-sub visualizeSortKey
-{
- my $self = shift;
- my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift);
-
- if ($self->{UCA_Version} <= 8) {
- $view =~ s/ ?0000 ?/|/g;
- } else {
- $view =~ s/\b0000\b/|/g;
- }
- return "[$view]";
-}
-
##
## arrayref of JCPS = splitEnt(string to be collated)
my $reH = $self->{rearrangeHash};
my $vers = $self->{UCA_Version};
my $ver9 = $vers >= 9 && $vers <= 11;
+ my $uXS = $self->{__useXS}; ### XS only
my ($str, @buf);
} elsif ($ver9) {
$src[$i] = undef if $map->{ $src[$i] } &&
@{ $map->{ $src[$i] } } == 0;
+### begin XS only ###
+ if ($uXS) {
+ $src[$i] = undef if _ignorable_simple($src[$i]);
+ }
+### end XS only ###
}
}
}
# skip completely ignorable
- if ($map->{$jcps} && @{ $map->{$jcps} } == 0) {
+ if ($uXS && $jcps !~ /;/ && _ignorable_simple($jcps) || ### XS only
+ $map->{$jcps} && @{ $map->{$jcps} } == 0) {
if ($wLen && @buf) {
$buf[-1][2] = $i + 1;
}
{
my $self = shift;
my $u = shift;
- my $vbl = $self->{variable};
my $map = $self->{mapping};
my $der = $self->{derivCode};
+ my $uXS = $self->{__useXS}; ### XS only
return if !defined $u;
- return map(_varCE($vbl, $_), @{ $map->{$u} })
+ return map($self->varCE($_), @{ $map->{$u} })
if $map->{$u};
+### begin XS only ###
+ return map($self->varCE($_), _fetch_simple($u))
+ if $uXS && _exists_simple($u);
+### end XS only ###
# JCPS must not be a contraction, then it's a code point.
if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
$map->{$contract} and @decH = ($contract, $decH[2]);
}
# even if V's ignorable, LT contraction is not supported.
- # If such a situatution were required, NFD should be used.
+ # If such a situation were required, NFD should be used.
}
if (@decH == 3 && $max->{$decH[1]}) {
my $contract = join(CODE_SEP, @decH[1,2]);
}
@hangulCE = map({
- $map->{$_} ? @{ $map->{$_} } : $der->($_);
+ $map->{$_} ? @{ $map->{$_} } :
+ $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
+ $der->($_);
} @decH);
}
- return map _varCE($vbl, $_), @hangulCE;
+ return map $self->varCE($_), @hangulCE;
} else {
my $cjk = $self->{overrideCJK};
my $vers = $self->{UCA_Version};
if ($cjk && _isUIdeo($u, $vers)) {
my @cjkCE = map _pack_override($_, $u, $der), $cjk->($u);
- return map _varCE($vbl, $_), @cjkCE;
+ return map $self->varCE($_), @cjkCE;
}
if ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
- return map _varCE($vbl, $_), _uideoCE_8($u);
+ return map $self->varCE($_), _uideoCE_8($u);
}
- return map _varCE($vbl, $_), $der->($u);
+ return map $self->varCE($_), $der->($u);
}
}
sub getSortKey
{
my $self = shift;
- my $lev = $self->{level};
my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS
my $vers = $self->{UCA_Version};
- my $vbl = $self->{variable};
my $term = $self->{hangul_terminator};
- my $v2i = $vers >= 9 && $vbl ne 'non-ignorable';
my @buf; # weight arrays
if ($term) {
my $preHST = '';
- my $termCE = _varCE($vbl, pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
+ my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
foreach my $jcps (@$rEnt) {
# weird things like VL, TL-contraction are not considered!
my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
}
}
- # make sort key
- my @ret = ([],[],[],[]);
- my $last_is_variable;
-
- foreach my $vwt (@buf) {
- my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
-
- # "Ignorable (L1, L2) after Variable" since track. v. 9
- if ($v2i) {
- if ($var) {
- $last_is_variable = TRUE;
- } elsif (!$wt[0]) { # ignorable
- next if $last_is_variable;
- } else {
- $last_is_variable = FALSE;
- }
- }
- foreach my $v (0..$lev-1) {
- 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v];
- }
- }
-
- # modification of tertiary weights
- if ($self->{upper_before_lower}) {
- foreach my $w (@{ $ret[2] }) {
- if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower
- elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper
- elsif ($w == 0x1C) { $w += 1 } # square upper
- elsif ($w == 0x1D) { $w -= 1 } # square lower
- }
- }
- if ($self->{katakana_before_hiragana}) {
- foreach my $w (@{ $ret[2] }) {
- if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana
- elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana
- }
- }
-
- if ($self->{backwardsFlag}) {
- for (my $v = MinLevel; $v <= MaxLevel; $v++) {
- if ($self->{backwardsFlag} & (1 << $v)) {
- @{ $ret[$v-1] } = reverse @{ $ret[$v-1] };
- }
- }
- }
-
- join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret;
+ return $self->mk_SortKey(\@buf); ### XS only
}
}
-sub _derivCE_22 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
- CJK_ExtCIni <= $u && $u <= CJK_ExtCFin ||
- CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_20 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF52 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin ||
- CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_18 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF51 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_14 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidF41 || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_9 {
- my $u = shift;
- my $base = (CJK_UidIni <= $u && $u <= CJK_UidFin || $CompatUI{$u})
- ? 0xFB40 : # CJK
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin ||
- CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ? 0xFB80 # CJK ext.
- : 0xFBC0; # others
- my $aaaa = $base + ($u >> 15);
- my $bbbb = ($u & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u);
-}
-
-sub _derivCE_8 {
- my $code = shift;
- my $aaaa = 0xFF80 + ($code >> 15);
- my $bbbb = ($code & 0x7FFF) | 0x8000;
- return pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code),
- pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code);
-}
-
-sub _uideoCE_8 {
- my $u = shift;
- return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u);
-}
-
-sub _isUIdeo {
- # $uca_vers = 0 for _uideoCE_8()
- my ($u, $uca_vers) = @_;
- return((CJK_UidIni <= $u && (
- $uca_vers >= 20 ? ($u <= CJK_UidF52) :
- $uca_vers >= 18 ? ($u <= CJK_UidF51) :
- $uca_vers >= 14 ? ($u <= CJK_UidF41) :
- ($u <= CJK_UidFin))) || $CompatUI{$u}
- ||
- (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin)
- ||
- ($uca_vers >= 8 && CJK_ExtBIni <= $u && $u <= CJK_ExtBFin)
- ||
- ($uca_vers >= 20 && CJK_ExtCIni <= $u && $u <= CJK_ExtCFin)
- ||
- ($uca_vers >= 22 && CJK_ExtDIni <= $u && $u <= CJK_ExtDFin)
- );
-}
-
-
-##
-## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
-##
-sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g }
-
-#
-# $code *must* be in Hangul syllable.
-# Check it before you enter here.
-#
-sub _decompHangul {
- my $code = shift;
- my $si = $code - Hangul_SBase;
- my $li = int( $si / Hangul_NCount);
- my $vi = int(($si % Hangul_NCount) / Hangul_TCount);
- my $ti = $si % Hangul_TCount;
- return (
- Hangul_LBase + $li,
- Hangul_VBase + $vi,
- $ti ? (Hangul_TBase + $ti) : (),
- );
-}
-
-sub _isIllegal {
- my $code = shift;
- return((! defined $code) # removed
- || ($code < 0 || 0x10FFFF < $code) # out of range
- );
-}
-
-sub _isNonchar {
- my $code = shift;
- return((($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c)
- || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates
- || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters
- );
-}
-
-# Hangul Syllable Type
-sub getHST {
- my $u = shift;
- my $vers = shift || 0;
-
- if (Hangul_SIni <= $u && $u <= Hangul_SFin) {
- return +($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV";
- }
-
- if ($vers < 20) {
- return Hangul_LIni <= $u && $u <= Hangul_LFin ||
- $u == Hangul_LFill ? "L" :
- Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" :
- Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : "";
- } else {
- return Hangul_LIni <= $u && $u <= Hangul_LEnd ||
- HangulL2Ini <= $u && $u <= HangulL2Fin ? "L" :
- Hangul_VIni <= $u && $u <= Hangul_VEnd ||
- HangulV2Ini <= $u && $u <= HangulV2Fin ? "V" :
- Hangul_TIni <= $u && $u <= Hangul_TEnd ||
- HangulT2Ini <= $u && $u <= HangulT2Fin ? "T" : "";
- }
-}
-
-
##
## bool _nonIgnorAtLevel(arrayref weights, int level)
##
my $lev = shift;
for my $g (0..@$substr-1){
- # Do the $g'th graphemes have the same number of AV weigths?
+ # Do the $g'th graphemes have the same number of AV weights?
return if @{ $source->[$g] } != @{ $substr->[$g] };
for my $w (0..@{ $substr->[$g] }-1) {
##
## (int position, int length)
-## int position = index(string, substring, position, [undoc'ed grobal])
+## int position = index(string, substring, position, [undoc'ed global])
##
-## With "grobal" (only for the list context),
+## With "global" (only for the list context),
## returns list of arrayref[position, length].
##
sub index
my $subE = $self->splitEnt(shift);
my $pos = @_ ? shift : 0;
$pos = 0 if $pos < 0;
- my $grob = shift;
+ my $glob = shift;
my $lev = $self->{level};
my $v2i = $self->{UCA_Version} >= 9 &&
if (! @$subE) {
my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
- return $grob
+ return $glob
? map([$_, 0], $temp..$len)
: wantarray ? ($temp,0) : $temp;
}
_eqArray(\@strWt, \@subWt, $lev)) {
my $temp = $iniPos[0] + $pos;
- if ($grob) {
+ if ($glob) {
push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
splice @strWt, 0, $#subWt;
splice @iniPos, 0, $#subWt;
}
}
- return $grob
+ return $glob
? @g_ret
: wantarray ? () : NOMATCHPOS;
}
hangul_terminator => $term_primary_weight,
ignoreName => qr/$ignoreName/,
ignoreChar => qr/$ignoreChar/,
+ ignore_level2 => $bool,
katakana_before_hiragana => $bool,
level => $collationLevel,
normalization => $normalization_form,
overrideHangul => \&overrideHangul,
preprocess => \&preprocess,
rearrange => \@charList,
+ rewrite => \&rewrite,
suppress => \@charList,
table => $filename,
undefName => qr/$undefName/,
=item UCA_Version
-If the tracking version number of UCA is given,
-behavior of that tracking version is emulated on collating.
+If the revision (previously "tracking version") number of UCA is given,
+behavior of that revision is emulated on collating.
If omitted, the return value of C<UCA_Version()> is used.
-The following tracking versions are supported. The default is 20.
+The following revisions are supported. The default is 24.
UCA Unicode Standard DUCET (@version)
-------------------------------------------------------
18 5.1.0 5.1.0 (5.1.0)
20 5.2.0 5.2.0 (5.2.0)
22 6.0.0 6.0.0 (6.0.0)
+ 24 6.1.0 6.1.0 (6.1.0)
-Note: Recent UTS #10 renames "Tracking Version" to "Revision."
-
-* Noncharacters (e.g. U+FFFF) are not ignored, and can be overrided
+* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
since C<UCA_Version> 22.
* Fully ignorable characters were ignored, and would not interrupt
backwards => $levelNumber or \@levelNumbers
Weights in reverse order; ex. level 2 (diacritic ordering) in French.
-If omitted, forwards at all the levels.
+If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
+forwards at all the levels.
=item entry
If the same character (or a sequence of characters) exists
in the collation element table through C<table>,
-mapping to collation elements is overrided.
+mapping to collation elements is overridden.
If it does not exist, the mapping is defined additionally.
entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
E.g. when 'a' and 'e' are ignorable,
'element' is equal to 'lament' (or 'lmnt').
+=item ignore_level2
+
+-- see 5.1 Parametric Tailoring, UTS #10.
+
+By default, case-sensitive comparison (that is level 3 difference)
+won't ignore accents (that is level 2 difference).
+
+If the parameter is made true, accents (and other primary ignorable
+characters) are ignored, even though cases are taken into account.
+
+B<NOTE>: C<level> should be 3 or greater.
+
=item katakana_before_hiragana
-- see 7.3.1 Tertiary Weight Table, UTS #10.
those in the CJK Unified Ideographs Extension A etc.
In the CJK Unified Ideographs block:
- U+4E00..U+9FA5 if UCA_Version is 8 to 11.
- U+4E00..U+9FBB if UCA_Version is 14 to 16.
+ U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
+ U+4E00..U+9FBB if UCA_Version is 14 or 16.
U+4E00..U+9FC3 if UCA_Version is 18.
- U+4E00..U+9FCB if UCA_Version is 20 or greater.
+ U+4E00..U+9FCB if UCA_Version is 20 or 22.
+ U+4E00..U+9FCC if UCA_Version is 24.
In the CJK Unified Ideographs Extension blocks:
Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or greater.
Through C<overrideCJK>, ordering of CJK unified ideographs (including
-extensions) can be overrided.
+extensions) can be overridden.
ex. CJK unified ideographs in the JIS code point order.
B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
-ideographs. But they can't be overrided via C<overrideCJK> when you use
+ideographs. But they can't be overridden via C<overrideCJK> when you use
DUCET, as the table includes weights for them. C<table> or C<entry> has
priority over C<overrideCJK>.
By default, Hangul syllables are decomposed into Hangul Jamo,
even if C<(normalization =E<gt> undef)>.
-But the mapping of Hangul syllables may be overrided.
+But the mapping of Hangul syllables may be overridden.
This parameter works like C<overrideCJK>, so see there for examples.
-- see 5.1 Preprocessing, UTS #10.
-If specified, the coderef is used to preprocess
+If specified, the coderef is used to preprocess each string
before the formation of sort keys.
ex. dropping English articles, such as "a" or "the".
B<According to the version 9 of UCA, this parameter shall not be used;
but it is not warned at present.>
+=item rewrite
+
+If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
+The coderef will get each line, and then should return a rewritten line
+according to the UCA file format.
+If the coderef returns an empty line, the line will be skipped.
+
+e.g. any primary ignorable characters into tertiary ignorable:
+
+ rewrite => sub {
+ my $line = shift;
+ $line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
+ return $line;
+ },
+
+This example shows rewriting weights. C<rewrite> is allowed to
+affect code points, weights, and the name.
+
+B<NOTE>: C<table> is available to use another table file;
+preparing a modified table once would be more efficient than
+rewriting lines on reading an unmodified table every time.
+
=item suppress
-- see suppress contractions in 5.14.11 Special-Purpose Commands,
UTS #35 (LDML).
Contractions beginning with the specified characters are suppressed,
-even if those contractions are defined in C<table> or C<entry>.
+even if those contractions are defined in C<table>.
An example for Russian and some languages using the Cyrillic script:
where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
+B<NOTE>: Contractions via C<entry> are not be suppressed.
+
=item table
-- see 3.2 Default Unicode Collation Element Table, UTS #10.
B<NOTE>: When XSUB is used, the DUCET is compiled on building this
module, and it may save time at the run time.
Explicit saying C<table =E<gt> 'allkeys.txt'> (or using another table),
-or using C<ignoreChar>, C<ignoreName>, C<undefChar>, or C<undefName>
-will prevent this module from using the compiled DUCET.
+or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
+C<rewrite> will prevent this module from using the compiled DUCET.
If C<undef> is passed explicitly as the value for this key,
no file is read (but you can define collation elements via C<entry>).
-- see 3.2.2 Variable Weighting, UTS #10.
-This key allows to variable weighting for variable collation elements,
+This key allows for variable weighting of variable collation elements,
which are marked with an ASTERISK in the table
-(NOTE: Many punction marks and symbols are variable in F<allkeys.txt>).
+(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
=head2 Methods for Searching
-B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
-for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
-C<subst>, C<gsubst>) is croaked,
-as the position and the length might differ
-from those on the specified string.
-(And C<rearrange> and C<hangul_terminator> parameters are neglected.)
-
The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
but they are not aware of any pattern, but only a literal substring.
+B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
+for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
+C<subst>, C<gsubst>) is croaked, as the position and the length might
+differ from those on the specified string.
+
+C<rearrange> and C<hangul_terminator> parameters are neglected.
+C<katakana_before_hiragana> and C<upper_before_lower> don't affect
+matching and searching, as it doesn't matter whether greater or lesser.
+
=over 4
=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
If C<$substring> matches a part of C<$string>,
the first occurrence of the matching part is replaced by C<$replacement>
-(C<$string> is modified) and return C<$count> (always equals to C<1>).
+(C<$string> is modified) and C<$count> (always equals to C<1>) is returned.
C<$replacement> can be a C<CODEREF>,
taking the matching part as an argument,
=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
If C<$substring> matches a part of C<$string>,
-all the occurrences of the matching part is replaced by C<$replacement>
-(C<$string> is modified) and return C<$count>.
+all the occurrences of the matching part are replaced by C<$replacement>
+(C<$string> is modified) and C<$count> is returned.
C<$replacement> can be a C<CODEREF>,
taking the matching part as an argument,
my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
# (normalization => undef) is REQUIRED.
- my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L...";
+ my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
$Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
- # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>...";
+ # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
# i.e., all the camels are made bold-faced.
+ Examples: levels and ignore_level2 - what does camel match?
+ ---------------------------------------------------------------------------
+ level ignore_level2 | camel Camel came\x{301}l c-a-m-e-l cam\0e\0l
+ -----------------------|---------------------------------------------------
+ 1 false | yes yes yes yes yes
+ 2 false | yes yes no yes yes
+ 3 false | yes no no yes yes
+ 4 false | yes no no no yes
+ -----------------------|---------------------------------------------------
+ 1 true | yes yes yes yes yes
+ 2 true | yes yes yes yes yes
+ 3 true | yes no yes yes yes
+ 4 true | yes no yes no yes
+ ---------------------------------------------------------------------------
+ note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
+ at any level.
+
=back
=head2 Other Methods
=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
-Change the value of specified keys and returns the changed part.
+=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
+
+Changes the value of specified keys and returns the changed part.
$Collator = Unicode::Collate->new(level => 4);
=item C<UCA_Version()>
-Returns the tracking version number of UTS #10 this module consults.
-C<UCA_Version()> should return the tracking version corresponding
-with the DUCET incorporated.
+Returns the revision number of UTS #10 this module consults,
+that should correspond with the DUCET incorporated.
=item C<Base_Unicode_Version()>
-Returns the version number of UTS #10 this module consults.
+Returns the version number of UTS #10 this module consults,
+that should correspond with the DUCET incorporated.
=back
=head1 AUTHOR, COPYRIGHT AND LICENSE
The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
-<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2010,
+<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2012,
SADAHIRO Tomoyuki. Japan. All rights reserved.
This module is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
The file Unicode/Collate/allkeys.txt was copied verbatim
-from L<http://www.unicode.org/Public/UCA/5.2.0/allkeys.txt>.
-This file is Copyright (c) 1991-2009 Unicode, Inc. All rights reserved.
+from L<http://www.unicode.org/Public/UCA/6.1.0/allkeys.txt>.
+For this file, Copyright (c) 2001-2011 Unicode, Inc.
Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>.
=head1 SEE ALSO