X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/ae6aa56258ddae4c1d0bf3bf7356f260a4cd9744..06c8fc8f09dc8f7e52006b1a902e84e1587b786f:/lib/Unicode/Collate.pm diff --git a/lib/Unicode/Collate.pm b/lib/Unicode/Collate.pm index 4be5aab..18ed446 100644 --- a/lib/Unicode/Collate.pm +++ b/lib/Unicode/Collate.pm @@ -14,7 +14,7 @@ use File::Spec; require Exporter; -our $VERSION = '0.25'; +our $VERSION = '0.28'; our $PACKAGE = __PACKAGE__; our @ISA = qw(Exporter); @@ -53,7 +53,7 @@ use constant NOMATCHPOS => -1; # A coderef to get combining class imported from Unicode::Normalize # (i.e. \&Unicode::Normalize::getCombinClass). # This is also used as a HAS_UNICODE_NORMALIZE flag. -our $getCombinClass; +our $CVgetCombinClass; # Supported Levels use constant MinLevel => 1; @@ -225,17 +225,17 @@ sub checkCollator { croak "Unicode/Normalize.pm is required to normalize strings: $@" if $@; - Unicode::Normalize->import(); - $getCombinClass = \&Unicode::Normalize::getCombinClass - if ! $getCombinClass; - - $self->{normCode} = - $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC : - $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD : - $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC : - $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD : - croak "$PACKAGE unknown normalization form name: " - . $self->{normalization}; + $CVgetCombinClass = \&Unicode::Normalize::getCombinClass + if ! $CVgetCombinClass; + + if ($self->{normalization} ne 'prenormalized') { + my $norm = $self->{normalization}; + $self->{normCode} = sub { + Unicode::Normalize::normalize($norm, shift); + }; + eval { $self->{normCode}->("") }; # try + $@ and croak "$PACKAGE unknown normalization form name: $norm"; + } } return; } @@ -260,7 +260,7 @@ sub new if ! exists $self->{overrideHangul}; $self->{overrideCJK} = '' if ! exists $self->{overrideCJK}; - $self->{normalization} = 'D' + $self->{normalization} = 'NFD' if ! exists $self->{normalization}; $self->{alternate} = $self->{alternateTable} || 'shifted' if ! exists $self->{alternate}; @@ -350,16 +350,18 @@ sub parseEntry $k = '[.0000.0000.0000.0000]' if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; - my $is_L3_ignorable; + my $is_L3_ignorable = TRUE; foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. my @wt = _getHexArray($arr); push @key, pack(VCE_TEMPLATE, $var, @wt); - $is_L3_ignorable = TRUE - if $wt[0] + $wt[1] + $wt[2] == 0; + $is_L3_ignorable = FALSE + if $wt[0] + $wt[1] + $wt[2] != 0; # if $arr !~ /[1-9A-Fa-f]/; NG # Conformance Test shows L3-ignorable is completely ignorable. + # For expansion, an entry $is_L3_ignorable + # if and only if "all" CEs are [.0000.0000.0000]. } $self->{entries}{$entry} = \@key; @@ -475,28 +477,43 @@ sub splitCE if ($max->{$ce}) { # contract my $temp_ce = $ce; + my $ceLen = 1; + my $maxLen = $max->{$ce}; - for (my $p = $i + 1; $p < @src; $p++) { + for (my $p = $i + 1; $ceLen < $maxLen && $p < @src; $p++) { next if ! defined $src[$p]; $temp_ce .= CODE_SEP . $src[$p]; + $ceLen++; if ($ent->{$temp_ce}) { $ce = $temp_ce; $i = $p; } } - } - # with Combining Char (UTS#10, 4.2.1). - # requires Unicode::Normalize. - # Not be $wLen, as not croaked due to $norm. - if ($getCombinClass) { - for (my $p = $i + 1; $p < @src; $p++) { - next if ! defined $src[$p]; - last unless $getCombinClass->($src[$p]); - my $tail = CODE_SEP . $src[$p]; - if ($ent->{$ce.$tail}) { - $ce .= $tail; - $src[$p] = undef; + # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). + # This process requires Unicode::Normalize. + # If "normalize" is undef, here should be skipped *always* + # (in spite of bool value of $CVgetCombinClass), + # since canonical ordering cannot be expected. + # Blocked combining character should not be contracted. + + if ($self->{normalization}) + # $self->{normCode} is false in the case of "prenormalized". + { + my $preCC = 0; + my $curCC = 0; + + for (my $p = $i + 1; $p < @src; $p++) { + next if ! defined $src[$p]; + $curCC = $CVgetCombinClass->($src[$p]); + last unless $curCC; + my $tail = CODE_SEP . $src[$p]; + if ($preCC != $curCC && $ent->{$ce.$tail}) { + $ce .= $tail; + $src[$p] = undef; + } else { + $preCC = $curCC; + } } } } @@ -522,8 +539,6 @@ sub getWt my $self = shift; my $ce = shift; my $ent = $self->{entries}; - my $cjk = $self->{overrideCJK}; - my $hang = $self->{overrideHangul}; my $der = $self->{derivCode}; return if !defined $ce; @@ -534,18 +549,50 @@ sub getWt my $u = $ce; if (0xAC00 <= $u && $u <= 0xD7A3) { # is Hangul Syllale - return map $self->altCE($_), - $hang - ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)) - : defined $hang - ? map({ - $ent->{$_} ? @{ $ent->{$_} } : $der->($_); - } _decompHangul($u)) - : $der->($u); + my $hang = $self->{overrideHangul}; + my @hangulCE; + if ($hang) { + @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)); + } + elsif (!defined $hang) { + @hangulCE = $der->($u); + } + else { + my $max = $self->{maxlength}; + my @decH = _decompHangul($u); + + if (@decH == 2) { + my $contract = join(CODE_SEP, @decH); + @decH = ($contract) if $ent->{$contract}; + } else { # must be <@decH == 3> + if ($max->{$decH[0]}) { + my $contract = join(CODE_SEP, @decH); + if ($ent->{$contract}) { + @decH = ($contract); + } else { + $contract = join(CODE_SEP, @decH[0,1]); + $ent->{$contract} and @decH = ($contract, $decH[2]); + } + # even if V's ignorable, LT contraction is not supported. + # If such a situatution were required, NFD should be used. + } + if (@decH == 3 && $max->{$decH[1]}) { + my $contract = join(CODE_SEP, @decH[1,2]); + $ent->{$contract} and @decH = ($decH[0], $contract); + } + } + + @hangulCE = map({ + $ent->{$_} ? @{ $ent->{$_} } : $der->($_); + } @decH); + } + return map $self->altCE($_), @hangulCE; } elsif (0x3400 <= $u && $u <= 0x4DB5 || 0x4E00 <= $u && $u <= 0x9FA5 || - 0x20000 <= $u && $u <= 0x2A6D6) { # CJK Ideograph + 0x20000 <= $u && $u <= 0x2A6D6) # CJK Ideograph + { + my $cjk = $self->{overrideCJK}; return map $self->altCE($_), $cjk ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) @@ -1090,20 +1137,39 @@ If omitted, the maximum is the 4th. If specified, strings are normalized before preparation of sort keys (the normalization is executed after preprocess). -As a form name, one of the following names must be used. +A form name C accepts will be applied +as C<$normalization_form>. +Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. +See C for detail. +If omitted, C<'NFD'> is used. - 'C' or 'NFC' for Normalization Form C - 'D' or 'NFD' for Normalization Form D - 'KC' or 'NFKC' for Normalization Form KC - 'KD' or 'NFKD' for Normalization Form KD +L is performed after L (if defined). -If omitted, the string is put into Normalization Form D. +Furthermore, special values, C and C<"prenormalized">, can be used, +though they are not concerned with C. -If C is passed explicitly as the value for this key, +If C (not a string C<"undef">) is passed explicitly +as the value for this key, any normalization is not carried out (this may make tailoring easier if any normalization is not desired). - -see B. +Under C<(normalization =E undef)>, only contiguous contractions +are resolved; e.g. C would be primary equal to C, +even if C (and C) is ordered after C. +In this point, +C<(normalization =E undef, preprocess =E sub { NFD(shift) })> +B equivalent to C<(normalization =E 'NFD')>. + +In the case of C<(normalization =E "prenormalized")>, +any normalization is not performed, but +non-contiguous contractions with combining characters are performed. +Therefore +C<(normalization =E 'prenormalized', preprocess =E sub { NFD(shift) })> +B equivalent to C<(normalization =E 'NFD')>. +If source strings are finely prenormalized, +C<(normalization =E 'prenormalized')> may save time for normalization. + +Except C<(normalization =E undef)>, +B is required (see also B). =item overrideCJK @@ -1167,9 +1233,11 @@ Then, "the pen" is before "a pencil". preprocess => sub { my $str = shift; $str =~ s/\b(?:an?|the)\s+//gi; - $str; + return $str; }, +L is performed before L (if defined). + =item rearrange -- see 3.1.3 Rearrangement, UTS #10. @@ -1503,7 +1571,7 @@ B =head1 AUTHOR -SADAHIRO Tomoyuki, ESADAHIRO@cpan.orgE +SADAHIRO Tomoyuki, http://homepage1.nifty.com/nomenclator/perl/