perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	package Unicode::Collate;
	2
	# Compile-time guard: refuse to load when pack/unpack with the 'U'
	# template cannot convert between a character and its code point.
	BEGIN {
	    unless ("A" eq pack('U', 0x41)) {
	        die "Unicode::Collate cannot stringify a Unicode code point\n";
	    }
	    unless (0x41 == unpack('U', 'A')) {
	        die "Unicode::Collate cannot get a Unicode code point\n";
	    }
	}
	11
	12	use 5.006;
	13	use strict;
	14	use warnings;
	15	use Carp;
	16	use File::Spec;
	17
	18	no warnings 'utf8';
	19
	our $VERSION = '1.18';
	our $PACKAGE = __PACKAGE__;

	### begin XS only ###
	require DynaLoader;
	our @ISA = qw(DynaLoader);
	bootstrap Unicode::Collate $VERSION;
	### end XS only ###

	# Directory components (below each @INC entry) and default file name
	# of the collation element table.
	my @Path = qw(Unicode Collate);
	my $KeyFile = "allkeys.txt";

	# Perl's boolean
	use constant TRUE  => 1;
	use constant FALSE => "";
	use constant NOMATCHPOS => -1;

	# A coderef to get combining class imported from Unicode::Normalize
	# (i.e. \&Unicode::Normalize::getCombinClass).
	# This is also used as a HAS_UNICODE_NORMALIZE flag.
	my $CVgetCombinClass;

	# Supported Levels
	use constant MinLevel => 1;
	use constant MaxLevel => 4;

	# Minimum weights at level 2 and 3, respectively
	use constant Min2Wt => 0x20;
	use constant Min3Wt => 0x02;

	# Shifted weight at 4th level
	use constant Shift4Wt => 0xFFFF;

	# A boolean for Variable and 16-bit weights at 4 levels of Collation Element
	use constant VCE_TEMPLATE => 'Cn4';

	# A sort key: 16-bit weights
	use constant KEY_TEMPLATE => 'n*';

	# The tie-breaking: 32-bit weights
	use constant TIE_TEMPLATE => 'N*';

	# Level separator in a sort key:
	# i.e. pack(KEY_TEMPLATE, 0)
	use constant LEVEL_SEP => "\0\0";

	# As Unicode code point separator for hash keys.
	# A joined code point string (denoted by JCPS below)
	# like "65;768" is used for internal processing
	# instead of Perl's Unicode string like "\x41\x{300}",
	# as the native code point is different from the Unicode code point
	# on EBCDIC platform.
	# This character must not be included in any stringified
	# representation of an integer.
	use constant CODE_SEP => ';';
	# NOTE: in regex /;/ is used for $jcps!

	# boolean values of variable weights
	use constant NON_VAR => 0; # Non-Variable character
	use constant VAR     => 1; # Variable character

	# specific code points
	use constant Hangul_SIni => 0xAC00;
	use constant Hangul_SFin => 0xD7A3;

	# Logical_Order_Exception in PropList.txt
	my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ];

	# for highestFFFF and minimalFFFE
	my $HighestVCE = pack(VCE_TEMPLATE, 0, 0xFFFE, 0x20, 0x5, 0xFFFF);
	my $minimalVCE = pack(VCE_TEMPLATE, 0, 1, 0x20, 0x5, 0xFFFE);
	91
	# Default UCA revision number; new() uses it when UCA_Version is not given.
	sub UCA_Version { "32" }
	93
	# Version string of the Unicode Standard reported by this module.
	sub Base_Unicode_Version { "8.0.0" }
	95
	96	######
	97
	# string = pack_U(list of Unicode code points)
	sub pack_U {
	    return pack('U*', @_);
	}
	101
	# list of Unicode code points = unpack_U(string)
	#
	# The 'U*' template is required on both sides: unpack must return every
	# code point of the string (not only the first), and appending
	# pack('U*') (an empty character string) makes perl treat the argument
	# as a character string before it is unpacked.
	sub unpack_U {
	    return unpack('U*', shift(@_).pack('U*'));
	}
	105
	106	######
	107
	# Accepted values of the 'variable' (alias 'alternate') parameter.
	my (%VariableOK);
	@VariableOK{ qw/
	    blanked  non-ignorable  shifted  shift-trimmed
	/ } = (); # keys lowercased

	# Parameters that change() is allowed to modify.
	our @ChangeOK = qw/
	    alternate backwards level normalization rearrange
	    katakana_before_hiragana upper_before_lower ignore_level2
	    overrideCJK overrideHangul overrideOut preprocess UCA_Version
	    hangul_terminator variable identical highestFFFF minimalFFFE
	    long_contraction
	/;

	# Parameters / internal fields that change() refuses to modify.
	our @ChangeNG = qw/
	    entry mapping table maxlength contraction
	    ignoreChar ignoreName undefChar undefName rewrite
	    versionTable alternateTable backwardsTable forwardsTable
	    rearrangeTable variableTable
	    derivCode normCode rearrangeHash backwardsFlag
	    suppress suppressHash
	    __useXS /; ### XS only
	# The hash key 'ignored' was deleted at v 0.21.
	# The hash key 'isShift' was deleted at v 0.23.
	# The hash key 'combining' was deleted at v 0.24.
	# The hash key 'entries' was deleted at v 0.30.
	# The hash key 'L3_ignorable' was deleted at v 0.40.
	134
	# Returns the value stored under the 'versionTable' key (set by
	# parseAtmark() from an "@version" line), or 'unknown' when it is false.
	sub version {
	    my $self = shift;
	    return $self->{versionTable} || 'unknown';
	}
	139
	# Lookup hashes keyed by the names in @ChangeOK / @ChangeNG
	# (values are undef; only exists() is used).
	my (%ChangeOK, %ChangeNG);
	@ChangeOK{ @ChangeOK } = ();
	@ChangeNG{ @ChangeNG } = ();
	143
	##
	## %old = $self->change(%new)     (list context)
	## $self = $self->change(%new)    (scalar context)
	##
	## Updates the parameters listed in @ChangeOK, croaks for those listed
	## in @ChangeNG, silently ignores any other key, and then re-validates
	## the collator with checkCollator().
	##
	sub change {
	    my $self = shift;
	    my %hash = @_;
	    my %old;

	    # 'alternate' is an alias of 'variable'; 'variable' wins if both given.
	    if (exists $hash{alternate}) {
	        if (exists $hash{variable}) {
	            delete $hash{alternate};
	        } else {
	            $hash{variable} = $hash{alternate};
	        }
	    }
	    foreach my $k (keys %hash) {
	        if (exists $ChangeOK{$k}) {
	            $old{$k} = $self->{$k};
	            $self->{$k} = $hash{$k};
	        } elsif (exists $ChangeNG{$k}) {
	            croak "change of $k via change() is not allowed!";
	        }
	        # else => ignored
	    }
	    $self->checkCollator();
	    return wantarray ? %old : $self;
	}
	167
	##
	## _checkLevel(level, key)
	## Croaks unless MinLevel <= level <= MaxLevel; key names the parameter
	## ('level' or 'backwards') for the error message.
	##
	sub _checkLevel {
	    my $level = shift;
	    my $key   = shift; # 'level' or 'backwards'
	    MinLevel <= $level or croak sprintf
	        "Illegal level %d (in value for key '%s') lower than %d.",
	        $level, $key, MinLevel;
	    $level <= MaxLevel or croak sprintf
	        "Unsupported level %d (in value for key '%s') higher than %d.",
	        $level, $key, MaxLevel;
	}
	178
	# UCA_Version => coderef that derives collation elements;
	# checkCollator() stores the selected coderef in $self->{derivCode}.
	my %DerivCode = (
	    8  => \&_derivCE_8,
	    9  => \&_derivCE_9,
	    11 => \&_derivCE_9, # 11 == 9
	    14 => \&_derivCE_14,
	    16 => \&_derivCE_14, # 16 == 14
	    18 => \&_derivCE_18,
	    20 => \&_derivCE_20,
	    22 => \&_derivCE_22,
	    24 => \&_derivCE_24,
	    26 => \&_derivCE_24, # 26 == 24
	    28 => \&_derivCE_24, # 28 == 24
	    30 => \&_derivCE_24, # 30 == 24
	    32 => \&_derivCE_32,
	);
	194
	##
	## $self->checkCollator()
	## Validates the current settings and (re)computes the fields derived
	## from them: derivCode, variable/alternate, backwardsFlag, rearrange,
	## rearrangeHash and normCode.  Croaks on an invalid setting.
	##
	sub checkCollator {
	    my $self = shift;
	    _checkLevel($self->{level}, "level");

	    $self->{derivCode} = $DerivCode{ $self->{UCA_Version} }
	        or croak "Illegal UCA version (passed $self->{UCA_Version}).";

	    $self->{variable} ||= $self->{alternate} || $self->{variableTable} ||
	                          $self->{alternateTable} || 'shifted';
	    $self->{variable} = $self->{alternate} = lc($self->{variable});
	    exists $VariableOK{ $self->{variable} }
	        or croak "$PACKAGE unknown variable parameter name: $self->{variable}";

	    # backwardsFlag is a bit mask with bit N set for each backward level N.
	    if (! defined $self->{backwards}) {
	        $self->{backwardsFlag} = 0;
	    } elsif (! ref $self->{backwards}) {
	        _checkLevel($self->{backwards}, "backwards");
	        $self->{backwardsFlag} = 1 << $self->{backwards};
	    } else {
	        my %level;
	        $self->{backwardsFlag} = 0;
	        for my $b (@{ $self->{backwards} }) {
	            _checkLevel($b, "backwards");
	            $level{$b} = 1;
	        }
	        for my $v (sort keys %level) {
	            $self->{backwardsFlag} += 1 << $v;
	        }
	    }

	    defined $self->{rearrange} or $self->{rearrange} = [];
	    ref $self->{rearrange}
	        or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF";

	    # keys of $self->{rearrangeHash} are $self->{rearrange}.
	    $self->{rearrangeHash} = undef;

	    if (@{ $self->{rearrange} }) {
	        @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
	    }

	    $self->{normCode} = undef;

	    if (defined $self->{normalization}) {
	        eval { require Unicode::Normalize };
	        $@ and croak "Unicode::Normalize is required to normalize strings";

	        $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass;

	        if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default
	            $self->{normCode} = \&Unicode::Normalize::NFD;
	        }
	        elsif ($self->{normalization} ne 'prenormalized') {
	            my $norm = $self->{normalization};
	            $self->{normCode} = sub {
	                Unicode::Normalize::normalize($norm, shift);
	            };
	            eval { $self->{normCode}->("") }; # try
	            $@ and croak "$PACKAGE unknown normalization form name: $norm";
	        }
	    }
	    return;
	}
	258
	##
	## $collator = Unicode::Collate->new(%tailoring)
	## Blesses the given key/value pairs, loads the collation element table
	## (unless table => undef was passed), applies 'entry' tailoring, fills
	## in defaults and validates the result with checkCollator().
	##
	sub new
	{
	    my $class = shift;
	    my $self = bless { @_ }, $class;

	    ### begin XS only ###
	    if (! exists $self->{table}     && !defined $self->{rewrite} &&
	        !defined $self->{undefName} && !defined $self->{ignoreName} &&
	        !defined $self->{undefChar} && !defined $self->{ignoreChar}) {
	        $self->{__useXS} = \&_fetch_simple;
	    } else {
	        $self->{__useXS} = undef;
	    }
	    ### end XS only ###

	    # keys of $self->{suppressHash} are $self->{suppress}.
	    if ($self->{suppress} && @{ $self->{suppress} }) {
	        @{ $self->{suppressHash} }{ @{ $self->{suppress} } } = ();
	    } # before read_table()

	    # If undef is passed explicitly, no file is read.
	    $self->{table} = $KeyFile if ! exists $self->{table};
	    $self->read_table() if defined $self->{table};

	    if ($self->{entry}) {
	        while ($self->{entry} =~ /([^\n]+)/g) {
	            $self->parseEntry($1, TRUE);
	        }
	    }

	    # only in new(), not in change()
	    $self->{level} ||= MaxLevel;
	    $self->{UCA_Version} ||= UCA_Version();

	    $self->{overrideHangul} = FALSE
	        if ! exists $self->{overrideHangul};
	    $self->{overrideCJK} = FALSE
	        if ! exists $self->{overrideCJK};
	    $self->{normalization} = 'NFD'
	        if ! exists $self->{normalization};
	    $self->{rearrange} = $self->{rearrangeTable} ||
	        ($self->{UCA_Version} <= 11 ? $DefaultRearrange : [])
	        if ! exists $self->{rearrange};
	    $self->{backwards} = $self->{backwardsTable}
	        if ! exists $self->{backwards};
	    exists $self->{long_contraction} or $self->{long_contraction}
	        = 22 <= $self->{UCA_Version} && $self->{UCA_Version} <= 24;

	    # checkCollator() will be called in change()
	    $self->checkCollator();

	    return $self;
	}
	312
	##
	## $self->parseAtmark(line)
	## Handles one "@..." directive line of a collation element table; the
	## caller has already removed the leading "@" (s/^\s*\@//).  Stores the
	## value under versionTable, variableTable, alternateTable,
	## backwardsTable, forwardsTable or rearrangeTable; other directives
	## are ignored.
	##
	sub parseAtmark {
	    my $self = shift;
	    my $line = shift; # after s/^\s*\@//

	    # Capture the whole non-space token, not only its first character.
	    if ($line =~ /^version\s*(\S*)/) {
	        $self->{versionTable} ||= $1;
	    }
	    elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9
	        $self->{variableTable} ||= $1;
	    }
	    elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8
	        $self->{alternateTable} ||= $1;
	    }
	    elsif ($line =~ /^backwards\s+(\S*)/) {
	        push @{ $self->{backwardsTable} }, $1;
	    }
	    elsif ($line =~ /^forwards\s+(\S*)/) { # perhaps no use
	        push @{ $self->{forwardsTable} }, $1;
	    }
	    elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG
	        push @{ $self->{rearrangeTable} }, _getHexArray($1);
	    }
	}
	336
	##
	## $self->read_table()
	## Loads the collation element table into $self.  With __useXS set, only
	## the lines returned by _fetch_rest() are parsed; otherwise the file
	## named by $self->{table} is searched for under Unicode/Collate in each
	## @INC directory and parsed line by line.  Croaks if no such file can
	## be opened.
	##
	sub read_table {
	    my $self = shift;

	    ### begin XS only ###
	    if ($self->{__useXS}) {
	        my @rest = _fetch_rest(); # complex matter need to parse
	        for my $line (@rest) {
	            next if $line =~ /^\s*#/;

	            if ($line =~ s/^\s*\@//) {
	                $self->parseAtmark($line);
	            } else {
	                $self->parseEntry($line);
	            }
	        }
	        return;
	    }
	    ### end XS only ###

	    my($f, $fh);
	    foreach my $d (@INC) {
	        $f = File::Spec->catfile($d, @Path, $self->{table});
	        # Three-argument open: the table name is supplied by the caller
	        # and must never be interpreted as an open mode or a pipe.
	        last if open($fh, '<', $f);
	        $f = undef;
	    }
	    if (!defined $f) {
	        $f = File::Spec->catfile(@Path, $self->{table});
	        croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)");
	    }

	    while (my $line = <$fh>) {
	        next if $line =~ /^\s*#/;

	        if ($line =~ s/^\s*\@//) {
	            $self->parseAtmark($line);
	        } else {
	            $self->parseEntry($line);
	        }
	    }
	    close $fh;
	}
	378
	379
	##
	## get $line, parse it, and write an entry in $self
	##
	## $self->parseEntry(line, tailoring)
	## line has the form "<charList> ; <collElement>... # name".  On success
	## the entry is stored in $self->{mapping} under its JCPS, and
	## $self->{maxlength} / $self->{contraction} are updated for sequences.
	## tailoring is true for lines coming from the 'entry' parameter; such
	## lines are not subject to 'suppress'.
	##
	sub parseEntry
	{
	    my $self = shift;
	    my $line = shift;
	    my $tailoring = shift;
	    my($name, $entry, @uv, @key);

	    if (defined $self->{rewrite}) {
	        $line = $self->{rewrite}->($line);
	    }

	    return if $line !~ /^\s*[0-9A-Fa-f]/;

	    # removes comment and gets name
	    $name = $1
	        if $line =~ s/[#%]\s*(.*)//;
	    return if defined $self->{undefName} && $name =~ /$self->{undefName}/;

	    # gets element
	    my($e, $k) = split /;/, $line;
	    croak "Wrong Entry: <charList> must be separated by ';' from <collElement>"
	        if ! $k;

	    @uv = _getHexArray($e);
	    return if !@uv;
	    return if @uv > 1 && $self->{suppressHash} && !$tailoring &&
	              exists $self->{suppressHash}{$uv[0]};
	    $entry = join(CODE_SEP, @uv); # in JCPS

	    if (defined $self->{undefChar} || defined $self->{ignoreChar}) {
	        my $ele = pack_U(@uv);

	        # regarded as if it were not stored in the table
	        return
	            if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;

	        # replaced as completely ignorable
	        $k = '[.0000.0000.0000.0000]'
	            if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/;
	    }

	    # replaced as completely ignorable
	    $k = '[.0000.0000.0000.0000]'
	        if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/;

	    my $is_L3_ignorable = TRUE;

	    foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed
	        my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient.
	        my @wt = _getHexArray($arr);
	        push @key, pack(VCE_TEMPLATE, $var, @wt);
	        $is_L3_ignorable = FALSE
	            if $wt[0] || $wt[1] || $wt[2];
	        # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable
	        # is completely ignorable.
	        # For expansion, an entry $is_L3_ignorable
	        # if and only if "all" CEs are [.0000.0000.0000].
	    }

	    $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key;

	    # longest sequence (in code points) starting with $uv[0]
	    if (@uv > 1) {
	        if (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) {
	            $self->{maxlength}{$uv[0]} = @uv;
	        }
	    }
	    # register every proper prefix of two or more code points
	    while (@uv > 2) {
	        pop @uv;
	        my $fake_entry = join(CODE_SEP, @uv); # in JCPS
	        $self->{contraction}{$fake_entry} = 1;
	    }
	}
	455
	456
	##
	## string = $self->viewSortKey(string)
	## Passes the sort key of the string to visualizeSortKey() and
	## returns its result.
	##
	sub viewSortKey
	{
	    my $self = shift;
	    my $str  = shift;
	    $self->visualizeSortKey($self->getSortKey($str));
	}
	463
	464
	##
	## string = $self->process(string)
	## Applies $self->{preprocess} and then $self->{normCode} to the string;
	## each step is skipped when the corresponding value is not a reference.
	##
	sub process
	{
	    my $self = shift;
	    my $str  = shift;
	    my $prep = $self->{preprocess};
	    my $norm = $self->{normCode};

	    $str = &$prep($str) if ref $prep;
	    $str = &$norm($str) if ref $norm;
	    return $str;
	}
	476
	##
	## arrayref of JCPS = splitEnt(string to be collated)
	## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, TRUE)
	##
	## Splits the string into collation units (single code points or
	## contractions found in the mapping), dropping removed and completely
	## ignorable code points.
	##
	sub splitEnt
	{
	    my $self = shift;
	    my $str  = shift;
	    my $wLen = shift; # with Length

	    my $map  = $self->{mapping};
	    my $max  = $self->{maxlength};
	    my $reH  = $self->{rearrangeHash};
	    my $vers = $self->{UCA_Version};
	    my $ver9 = $vers >= 9 && $vers <= 11;
	    my $long = $self->{long_contraction};
	    my $uXS  = $self->{__useXS}; ### XS only

	    my @buf;

	    # get array of Unicode code point of string.
	    my @src = unpack_U($str);

	    # rearrangement:
	    # Character positions are not kept if rearranged,
	    # then neglected if $wLen is true.
	    if ($reH && ! $wLen) {
	        for (my $i = 0; $i < @src; $i++) {
	            if (exists $reH->{ $src[$i] } && $i + 1 < @src) {
	                ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]);
	                $i++;
	            }
	        }
	    }

	    # remove a code point marked as a completely ignorable.
	    for (my $i = 0; $i < @src; $i++) {
	        if ($vers <= 20 && _isIllegal($src[$i])) {
	            $src[$i] = undef;
	        } elsif ($ver9) {
	            $src[$i] = undef if $map->{ $src[$i] }
	                ? @{ $map->{ $src[$i] } } == 0
	                : $uXS && _ignorable_simple($src[$i]); ### XS only
	        }
	    }

	    for (my $i = 0; $i < @src; $i++) {
	        my $jcps = $src[$i];

	        # skip removed code point
	        if (! defined $jcps) {
	            if ($wLen && @buf) {
	                $buf[-1][2] = $i + 1;
	            }
	            next;
	        }

	        my $i_orig = $i;

	        # find contraction
	        if ($max->{$jcps}) {
	            my $temp_jcps = $jcps;
	            my $jcpsLen = 1;
	            my $maxLen = $max->{$jcps};

	            for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) {
	                next if ! defined $src[$p];
	                $temp_jcps .= CODE_SEP . $src[$p];
	                $jcpsLen++;
	                if ($map->{$temp_jcps}) {
	                    $jcps = $temp_jcps;
	                    $i = $p;
	                }
	            }

	            # discontiguous contraction with Combining Char (cf. UTS#10, S2.1).
	            # This process requires Unicode::Normalize.
	            # If "normalization" is undef, here should be skipped always
	            # (in spite of bool value of $CVgetCombinClass),
	            # since canonical ordering cannot be expected.
	            # Blocked combining character should not be contracted.

	            # $self->{normCode} is false in the case of "prenormalized".
	            if ($self->{normalization}) {
	                my $cont = $self->{contraction};
	                my $preCC = 0;
	                my $preCC_uc = 0;
	                my $jcps_uc = $jcps;
	                my(@out, @out_uc);

	                for (my $p = $i + 1; $p < @src; $p++) {
	                    next if ! defined $src[$p];
	                    my $curCC = $CVgetCombinClass->($src[$p]);
	                    last unless $curCC;
	                    my $tail = CODE_SEP . $src[$p];

	                    if ($preCC != $curCC && $map->{$jcps.$tail}) {
	                        $jcps .= $tail;
	                        push @out, $p;
	                    } else {
	                        $preCC = $curCC;
	                    }

	                    next if !$long;

	                    if ($preCC_uc != $curCC && ($map->{$jcps_uc.$tail} ||
	                                                $cont->{$jcps_uc.$tail})) {
	                        $jcps_uc .= $tail;
	                        push @out_uc, $p;
	                    } else {
	                        $preCC_uc = $curCC;
	                    }
	                }

	                if (@out_uc && $map->{$jcps_uc}) {
	                    $jcps = $jcps_uc;
	                    $src[$_] = undef for @out_uc;
	                } else {
	                    $src[$_] = undef for @out;
	                }
	            }
	        }

	        # skip completely ignorable
	        if ($map->{$jcps} ? @{ $map->{$jcps} } == 0 :
	            $uXS && $jcps !~ /;/ && _ignorable_simple($jcps)) { ### XS only
	            if ($wLen && @buf) {
	                $buf[-1][2] = $i + 1;
	            }
	            next;
	        }

	        push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps;
	    }
	    return \@buf;
	}
	613
	##
	## VCE = _pack_override(input, codepoint, derivCode)
	##
	## input is one value returned by an override* callback:
	##   - a reference: packed as the weights of a non-variable CE;
	##   - a defined scalar: used as the primary weight, with Min2Wt, Min3Wt
	##     and the code point as the remaining weights;
	##   - undef: falls back to derivCode (code points above 0x10FFFF are
	##     replaced with 0xFFFD first).
	##
	sub _pack_override ($$$) {
	    my $r   = shift;
	    my $u   = shift;
	    my $der = shift;

	    if (ref $r) {
	        return pack(VCE_TEMPLATE, NON_VAR, @$r);
	    } elsif (defined $r) {
	        return pack(VCE_TEMPLATE, NON_VAR, $r, Min2Wt, Min3Wt, $u);
	    } else {
	        $u = 0xFFFD if 0x10FFFF < $u;
	        return $der->($u);
	    }
	}
	631
	##
	## list of VCE = getWt(JCPS)
	##
	## Looks the JCPS up in the mapping (or the XS table); otherwise weights
	## are produced by the override callbacks or by $self->{derivCode}.
	## Every resulting element is passed through varCE().
	##
	sub getWt
	{
	    my $self = shift;
	    my $u    = shift;
	    my $map  = $self->{mapping};
	    my $der  = $self->{derivCode};
	    my $out  = $self->{overrideOut};
	    my $uXS  = $self->{__useXS}; ### XS only

	    return if !defined $u;
	    return $self->varCE($HighestVCE) if $u eq 0xFFFF && $self->{highestFFFF};
	    return $self->varCE($minimalVCE) if $u eq 0xFFFE && $self->{minimalFFFE};
	    $u = 0xFFFD if $u !~ /;/ && 0x10FFFF < $u && !$out;

	    my @ce;
	    if ($map->{$u}) {
	        @ce = @{ $map->{$u} }; # $u may be a contraction
	    ### begin XS only ###
	    } elsif ($uXS && _exists_simple($u)) {
	        @ce = _fetch_simple($u);
	    ### end XS only ###
	    } elsif (Hangul_SIni <= $u && $u <= Hangul_SFin) {
	        my $hang = $self->{overrideHangul};
	        if ($hang) {
	            @ce = map _pack_override($_, $u, $der), $hang->($u);
	        } elsif (!defined $hang) {
	            @ce = $der->($u);
	        } else {
	            my $max  = $self->{maxlength};
	            my @decH = _decompHangul($u);

	            if (@decH == 2) {
	                my $contract = join(CODE_SEP, @decH);
	                @decH = ($contract) if $map->{$contract};
	            } else { # must be <@decH == 3>
	                if ($max->{$decH[0]}) {
	                    my $contract = join(CODE_SEP, @decH);
	                    if ($map->{$contract}) {
	                        @decH = ($contract);
	                    } else {
	                        $contract = join(CODE_SEP, @decH[0,1]);
	                        $map->{$contract} and @decH = ($contract, $decH[2]);
	                    }
	                    # even if V's ignorable, LT contraction is not supported.
	                    # If such a situation were required, NFD should be used.
	                }
	                if (@decH == 3 && $max->{$decH[1]}) {
	                    my $contract = join(CODE_SEP, @decH[1,2]);
	                    $map->{$contract} and @decH = ($decH[0], $contract);
	                }
	            }

	            @ce = map({
	                    $map->{$_} ? @{ $map->{$_} } :
	                $uXS && _exists_simple($_) ? _fetch_simple($_) : ### XS only
	                    $der->($_);
	                } @decH);
	        }
	    } elsif ($out && 0x10FFFF < $u) {
	        @ce = map _pack_override($_, $u, $der), $out->($u);
	    } else {
	        my $cjk  = $self->{overrideCJK};
	        my $vers = $self->{UCA_Version};
	        if ($cjk && _isUIdeo($u, $vers)) {
	            @ce = map _pack_override($_, $u, $der), $cjk->($u);
	        } elsif ($vers == 8 && defined $cjk && _isUIdeo($u, 0)) {
	            @ce = _uideoCE_8($u);
	        } else {
	            @ce = $der->($u);
	        }
	    }
	    return map $self->varCE($_), @ce;
	}
	708
	709
	##
	## string sortkey = getSortKey(string arg)
	##
	## The string is passed through process() and splitEnt(), the weights
	## of every unit are collected with getWt() (inserting the hangul
	## terminator CE where 'hangul_terminator' applies), and mk_SortKey()
	## builds the key.  A LEVEL_SEP, and with 'identical' the code points
	## of the processed string, may be appended.
	##
	sub getSortKey
	{
	    my $self = shift;
	    my $orig = shift;
	    my $str  = $self->process($orig);
	    my $rEnt = $self->splitEnt($str); # get an arrayref of JCPS
	    my $vers = $self->{UCA_Version};
	    my $term = $self->{hangul_terminator};
	    my $lev  = $self->{level};
	    my $iden = $self->{identical};

	    my @buf; # weight arrays
	    if ($term) {
	        my $preHST = '';
	        my $termCE = $self->varCE(pack(VCE_TEMPLATE, NON_VAR, $term, 0,0,0));
	        foreach my $jcps (@$rEnt) {
	            # weird things like VL, TL-contraction are not considered!
	            my $curHST = join '', map getHST($_, $vers), split /;/, $jcps;
	            if ($preHST && !$curHST || # hangul before non-hangul
	                $preHST =~ /L\z/ && $curHST =~ /^T/ ||
	                $preHST =~ /V\z/ && $curHST =~ /^L/ ||
	                $preHST =~ /T\z/ && $curHST =~ /^[LV]/) {
	                push @buf, $termCE;
	            }
	            $preHST = $curHST;
	            push @buf, $self->getWt($jcps);
	        }
	        push @buf, $termCE if $preHST; # end at hangul
	    } else {
	        foreach my $jcps (@$rEnt) {
	            push @buf, $self->getWt($jcps);
	        }
	    }

	    my $rkey = $self->mk_SortKey(\@buf); ### XS only

	    if ($iden || $vers >= 26 && $lev == MaxLevel) {
	        $rkey .= LEVEL_SEP;
	        $rkey .= pack(TIE_TEMPLATE, unpack_U($str)) if $iden;
	    }
	    return $rkey;
	}
	755
	756
	##
	## int compare = cmp(string a, string b)
	##
	## cmp and the boolean variants below compare the sort keys of the two
	## strings with the Perl string operator of the same name.
	##
	sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) }
	sub eq  { $_[0]->getSortKey($_[1]) eq  $_[0]->getSortKey($_[2]) }
	sub ne  { $_[0]->getSortKey($_[1]) ne  $_[0]->getSortKey($_[2]) }
	sub lt  { $_[0]->getSortKey($_[1]) lt  $_[0]->getSortKey($_[2]) }
	sub le  { $_[0]->getSortKey($_[1]) le  $_[0]->getSortKey($_[2]) }
	sub gt  { $_[0]->getSortKey($_[1]) gt  $_[0]->getSortKey($_[2]) }
	sub ge  { $_[0]->getSortKey($_[1]) ge  $_[0]->getSortKey($_[2]) }
	767
	##
	## list[strings] sorted = sort(list[strings] arg)
	##
	## Each sort key is computed once, the [key, string] pairs are ordered
	## by key with the string comparison operator, and the original
	## strings are returned.
	##
	sub sort {
	    my $obj = shift;
	    return
	        map { $_->[1] }
	        sort{ $a->[0] cmp $b->[0] }
	        map [ $obj->getSortKey($_), $_ ], @_;
	}
	778
	779
	##
	## bool _nonIgnorAtLevel(arrayref weights, int level)
	##
	## TRUE if any weight from level MinLevel up to the given level is
	## non-zero, FALSE otherwise; returns nothing when weights is undef.
	##
	sub _nonIgnorAtLevel($$)
	{
	    my $wt = shift;
	    return if ! defined $wt;
	    my $lv = shift;
	    return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE;
	}
	790
	##
	## bool _eqArray(
	##    arrayref of arrayref[weights] source,
	##    arrayref of arrayref[weights] substr,
	##    int level)
	## * comparison of graphemes vs graphemes.
	##   @$source >= @$substr must be true (check it before call this);
	##
	## Returns 1 when the first @$substr graphemes of source have the same
	## number of weight arrays as those of substr and the same weights at
	## levels 1..level; returns nothing otherwise.
	##
	sub _eqArray($$$)
	{
	    my $source = shift;
	    my $substr = shift;
	    my $lev = shift;

	    for my $g (0..@$substr-1){
	        # Do the $g'th graphemes have the same number of AV weights?
	        return if @{ $source->[$g] } != @{ $substr->[$g] };

	        for my $w (0..@{ $substr->[$g] }-1) {
	            for my $v (0..$lev-1) {
	                return if $source->[$g][$w][$v] != $substr->[$g][$w][$v];
	            }
	        }
	    }
	    return 1;
	}
	817
	##
	## (int position, int length)
	## int position = index(string, substring, position, [undoc'ed global])
	##
	## With "global" (only for the list context),
	##  returns list of arrayref[position, length].
	##
	## Croaks when 'preprocess' or a normalization routine is set.  When
	## nothing matches, returns NOMATCHPOS in scalar context and an empty
	## list in list context.
	##
	sub index
	{
	    my $self = shift;
	    $self->{preprocess} and
	        croak "Don't use Preprocess with index(), match(), etc.";
	    $self->{normCode} and
	        croak "Don't use Normalization with index(), match(), etc.";

	    my $str  = shift;
	    my $len  = length($str);
	    my $sub  = shift;
	    my $subE = $self->splitEnt($sub);
	    my $pos  = @_ ? shift : 0;
	       $pos  = 0 if $pos < 0;
	    my $glob = shift;

	    my $lev = $self->{level};
	    my $v2i = $self->{UCA_Version} >= 9 &&
	              $self->{variable} ne 'non-ignorable';

	    # An empty (or fully ignorable) substring matches at every position.
	    if (! @$subE) {
	        my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos;
	        return $glob
	            ? map([$_, 0], $temp..$len)
	            : wantarray ? ($temp,0) : $temp;
	    }
	    $len < $pos
	        and return wantarray ? () : NOMATCHPOS;
	    my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE);
	    @$strE
	        or return wantarray ? () : NOMATCHPOS;

	    my(@strWt, @iniPos, @finPos, @subWt, @g_ret);

	    my $last_is_variable;
	    for my $vwt (map $self->getWt($_), @$subE) {
	        my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
	        my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

	        # "Ignorable (L1, L2) after Variable" since track. v. 9
	        if ($v2i) {
	            if ($var) {
	                $last_is_variable = TRUE;
	            }
	            elsif (!$wt[0]) { # ignorable
	                $to_be_pushed = FALSE if $last_is_variable;
	            }
	            else {
	                $last_is_variable = FALSE;
	            }
	        }

	        if (@subWt && !$var && !$wt[0]) {
	            push @{ $subWt[-1] }, \@wt if $to_be_pushed;
	        } elsif ($to_be_pushed) {
	            push @subWt, [ \@wt ];
	        }
	        # else ===> skipped
	    }

	    my $count = 0;
	    my $end = @$strE - 1;

	    $last_is_variable = FALSE; # reuse
	    for (my $i = 0; $i <= $end; ) { # no $i++
	        my $found_base = 0;

	        # fetch a grapheme
	        while ($i <= $end && $found_base == 0) {
	            for my $vwt ($self->getWt($strE->[$i][0])) {
	                my($var, @wt) = unpack(VCE_TEMPLATE, $vwt);
	                my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev);

	                # "Ignorable (L1, L2) after Variable" since track. v. 9
	                if ($v2i) {
	                    if ($var) {
	                        $last_is_variable = TRUE;
	                    }
	                    elsif (!$wt[0]) { # ignorable
	                        $to_be_pushed = FALSE if $last_is_variable;
	                    }
	                    else {
	                        $last_is_variable = FALSE;
	                    }
	                }

	                if (@strWt && !$var && !$wt[0]) {
	                    push @{ $strWt[-1] }, \@wt if $to_be_pushed;
	                    $finPos[-1] = $strE->[$i][2];
	                } elsif ($to_be_pushed) {
	                    push @strWt, [ \@wt ];
	                    push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1];
	                    $finPos[-1] = NOMATCHPOS if $found_base;
	                    push @finPos, $strE->[$i][2];
	                    $found_base++;
	                }
	                # else ===> no-op
	            }
	            $i++;
	        }

	        # try to match
	        while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) {
	            if ($iniPos[0] != NOMATCHPOS &&
	                    $finPos[$#subWt] != NOMATCHPOS &&
	                        _eqArray(\@strWt, \@subWt, $lev)) {
	                my $temp = $iniPos[0] + $pos;

	                if ($glob) {
	                    push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]];
	                    splice @strWt,  0, $#subWt;
	                    splice @iniPos, 0, $#subWt;
	                    splice @finPos, 0, $#subWt;
	                }
	                else {
	                    return wantarray
	                        ? ($temp, $finPos[$#subWt] - $iniPos[0])
	                        :  $temp;
	                }
	            }
	            shift @strWt;
	            shift @iniPos;
	            shift @finPos;
	        }
	    }

	    return $glob
	        ? @g_ret
	        : wantarray ? () : NOMATCHPOS;
	}
	955
	##
	## scalarref to matching part = match(string, substring)
	##
	## In list context the matching part itself is returned instead of a
	## reference to it.  Returns nothing when index() finds no match.
	##
	sub match
	{
	    my $self = shift;
	    if (my($pos,$len) = $self->index($_[0], $_[1])) {
	        my $temp = substr($_[0], $pos, $len);
	        return wantarray ? $temp : \$temp;
	        # An lvalue ref \substr should be avoided,
	        # since its value is affected by modification of its referent.
	    }
	    else {
	        return;
	    }
	}
	972
	##
	## arrayref matching parts = gmatch(string, substring)
	##
	## Calls index() in global mode from position 0 and returns the list
	## of substrings of string at the reported positions and lengths.
	##
	sub gmatch
	{
	    my $self = shift;
	    my $str  = shift;
	    my $sub  = shift;
	    return map substr($str, $_->[0], $_->[1]),
	        $self->index($str, $sub, 0, 'g');
	}
	984
	##
	## bool subst'ed = subst(string, substring, replace)
	##
	## Replaces the first match in place (the string argument is modified
	## through @_).  When replace is a CODE reference it is called with the
	## matching part and its return value is used as the replacement.
	##
	sub subst
	{
	    my $self = shift;
	    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;

	    if (my($pos,$len) = $self->index($_[0], $_[1])) {
	        if ($code) {
	            my $mat = substr($_[0], $pos, $len);
	            substr($_[0], $pos, $len, $code->($mat));
	        } else {
	            substr($_[0], $pos, $len, $_[2]);
	        }
	        return TRUE;
	    }
	    else {
	        return FALSE;
	    }
	}
	1006
	##
	## int count = gsubst(string, substring, replace)
	##
	## Replaces every match reported by index() in global mode, in place
	## (the string argument is modified through @_), and returns the number
	## of replacements.  replace may be a string or a CODE reference that
	## receives the matching part.
	##
	sub gsubst
	{
	    my $self = shift;
	    my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE;
	    my $cnt = 0;

	    # Replacement is carried out from the end, then use reverse.
	    for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) {
	        if ($code) {
	            my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]);
	            substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat));
	        } else {
	            substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]);
	        }
	        $cnt++;
	    }
	    return $cnt;
	}
	1028
	1029	1;
	1030	__END__
	1031
	1032	=head1 NAME
	1033
	1034	Unicode::Collate - Unicode Collation Algorithm
	1035
	1036	=head1 SYNOPSIS
	1037
	1038	use Unicode::Collate;
	1039
	1040	#construct
	1041	$Collator = Unicode::Collate->new(%tailoring);
	1042
	1043	#sort
	1044	@sorted = $Collator->sort(@not_sorted);
	1045
	1046	#compare
	1047	$result = $Collator->cmp($a, $b); # returns 1, 0, or -1.
	1048
	1049	B<Note:> Strings in C<@not_sorted>, C<$a> and C<$b> are interpreted
	1050	according to Perl's Unicode support. See L<perlunicode>,
	1051	L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
	1052	Otherwise, decode them beforehand or use C<preprocess> to do so.
	1053
	1054	=head1 DESCRIPTION
	1055
	1056	This module is an implementation of Unicode Technical Standard #10
	1057	(a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA).
	1058
	1059	=head2 Constructor and Tailoring
	1060
	1061	The C<new> method returns a collator object. If C<new()> is called
	1062	with no parameters, the collator performs the default collation.
	1063
	1064	$Collator = Unicode::Collate->new(
	1065	UCA_Version => $UCA_Version,
	1066	alternate => $alternate, # alias for 'variable'
	1067	backwards => $levelNumber, # or \@levelNumbers
	1068	entry => $element,
	1069	hangul_terminator => $term_primary_weight,
	1070	highestFFFF => $bool,
	1071	identical => $bool,
	1072	ignoreName => qr/$ignoreName/,
	1073	ignoreChar => qr/$ignoreChar/,
	1074	ignore_level2 => $bool,
	1075	katakana_before_hiragana => $bool,
	1076	level => $collationLevel,
	1077	long_contraction => $bool,
	1078	minimalFFFE => $bool,
	1079	normalization => $normalization_form,
	1080	overrideCJK => \&overrideCJK,
	1081	overrideHangul => \&overrideHangul,
	1082	preprocess => \&preprocess,
	1083	rearrange => \@charList,
	1084	rewrite => \&rewrite,
	1085	suppress => \@charList,
	1086	table => $filename,
	1087	undefName => qr/$undefName/,
	1088	undefChar => qr/$undefChar/,
	1089	upper_before_lower => $bool,
	1090	variable => $variable,
	1091	);
	1092
	1093	=over 4
	1094
	1095	=item UCA_Version
	1096
	1097	If the revision (previously "tracking version") number of UCA is given,
	1098	behavior of that revision is emulated on collating.
	1099	If omitted, the return value of C<UCA_Version()> is used.
	1100
	1101	The following revisions are supported. The default is 32.
	1102
	1103	UCA Unicode Standard DUCET (@version)
	1104	-------------------------------------------------------
	1105	8 3.1 3.0.1 (3.0.1d9)
	1106	9 3.1 with Corrigendum 3 3.1.1 (3.1.1)
	1107	11 4.0 4.0.0 (4.0.0)
	1108	14 4.1.0 4.1.0 (4.1.0)
	1109	16 5.0 5.0.0 (5.0.0)
	1110	18 5.1.0 5.1.0 (5.1.0)
	1111	20 5.2.0 5.2.0 (5.2.0)
	1112	22 6.0.0 6.0.0 (6.0.0)
	1113	24 6.1.0 6.1.0 (6.1.0)
	1114	26 6.2.0 6.2.0 (6.2.0)
	1115	28 6.3.0 6.3.0 (6.3.0)
	1116	30 7.0.0 7.0.0 (7.0.0)
	1117	32 8.0.0 8.0.0 (8.0.0)
	1118
	1119	* See below for C<long_contraction> with C<UCA_Version> 22 and 24.
	1120
	1121	* Noncharacters (e.g. U+FFFF) are not ignored, and can be overridden
	1122	since C<UCA_Version> 22.
	1123
	1124	* Out-of-range codepoints (greater than U+10FFFF) are not ignored,
	1125	and can be overridden since C<UCA_Version> 22.
	1126
	1127	* Fully ignorable characters were ignored, and would not interrupt
	1128	contractions with C<UCA_Version> 9 and 11.
	1129
	1130	* Treatment of ignorables after variables and some behaviors
	1131	were changed at C<UCA_Version> 9.
	1132
	1133	* Characters regarded as CJK unified ideographs (cf. C<overrideCJK>)
	1134	depend on C<UCA_Version>.
	1135
	1136	* Many hangul jamo are assigned at C<UCA_Version> 20, which will affect
	1137	C<hangul_terminator>.
	1138
	1139	=item alternate
	1140
	1141	-- see 3.2.2 Alternate Weighting, version 8 of UTS #10
	1142
	1143	For backward compatibility, C<alternate> (old name) can be used
	1144	as an alias for C<variable>.
	1145
	1146	=item backwards
	1147
	1148	-- see 3.4 Backward Accents, UTS #10.
	1149
	1150	backwards => $levelNumber or \@levelNumbers
	1151
	1152	Weights in reverse order; ex. level 2 (diacritic ordering) in French.
	1153	If omitted (or C<$levelNumber> is C<undef> or C<\@levelNumbers> is C<[]>),
	1154	forwards at all the levels.
	1155
	1156	=item entry
	1157
	1158	-- see 5 Tailoring; 9.1 Allkeys File Format, UTS #10.
	1159
	1160	If the same character (or a sequence of characters) exists
	1161	in the collation element table through C<table>,
	1162	mapping to collation elements is overridden.
	1163	If it does not exist, the mapping is defined additionally.
	1164
	1165	entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
	1166	0063 0068 ; [.0E6A.0020.0002.0063] # ch
	1167	0043 0068 ; [.0E6A.0020.0007.0043] # Ch
	1168	0043 0048 ; [.0E6A.0020.0008.0043] # CH
	1169	006C 006C ; [.0F4C.0020.0002.006C] # ll
	1170	004C 006C ; [.0F4C.0020.0007.004C] # Ll
	1171	004C 004C ; [.0F4C.0020.0008.004C] # LL
	1172	00F1 ; [.0F7B.0020.0002.00F1] # n-tilde
	1173	006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde
	1174	00D1 ; [.0F7B.0020.0008.00D1] # N-tilde
	1175	004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde
	1176	ENTRY
	1177
	1178	entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt)
	1179	00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e>
	1180	00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E>
	1181	ENTRY
	1182
	1183	B<NOTE:> The code point in the UCA file format (before C<';'>)
	1184	B<must> be a Unicode code point (defined as hexadecimal),
	1185	but not a native code point.
	1186	So C<0063> must always denote C<U+0063>,
	1187	but not a character of C<"\x63">.
	1188
	1189	Weighting may vary depending on collation element table.
	1190	So ensure the weights defined in C<entry> will be consistent with
	1191	those in the collation element table loaded via C<table>.
	1192
	1193	In DUCET v4.0.0, primary weight of C<C> is C<0E60>
	1194	and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A>
	1195	(as a value between C<0E60> and C<0E6D>)
	1196	makes ordering as C<C E<lt> CH E<lt> D>.
	1197	Exactly speaking DUCET already has some characters between C<C> and C<D>:
	1198	C<small capital C> (C<U+1D04>) with primary weight C<0E64>,
	1199	C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>,
	1200	and C<c-curl> (C<U+0255>) with C<0E69>.
	1201	Then primary weight C<0E6A> for C<CH> makes C<CH>
	1202	ordered between C<c-curl> and C<D>.
	1203
	1204	=item hangul_terminator
	1205
	1206	-- see 7.1.4 Trailing Weights, UTS #10.
	1207
	1208	If a true value is given (non-zero but should be positive),
	1209	it will be added as a terminator primary weight to the end of
	1210	every standard Hangul syllable. Secondary and any higher weights
	1211	for terminator are set to zero.
	1212	If the value is false or C<hangul_terminator> key does not exist,
	1213	insertion of terminator weights will not be performed.
	1214
	1215	Boundaries of Hangul syllables are determined
	1216	according to conjoining Jamo behavior in F<the Unicode Standard>
	1217	and F<HangulSyllableType.txt>.
	1218
	1219	B<Implementation Note:>
	1220	(1) For expansion mapping (Unicode character mapped
	1221	to a sequence of collation elements), a terminator will not be added
	1222	between collation elements, even if Hangul syllable boundary exists there.
	1223	Addition of terminator is restricted to the next position
	1224	to the last collation element.
	1225
	1226	(2) Non-conjoining Hangul letters
	1227	(Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not
	1228	automatically terminated with a terminator primary weight.
	1229	These characters may need terminator included in a collation element
	1230	table beforehand.
	1231
	1232	=item highestFFFF
	1233
	1234	-- see 2.4 Tailored noncharacter weights, UTS #35 (LDML) Part 5: Collation.
	1235
	1236	If the parameter is made true, C<U+FFFF> has a highest primary weight.
	1237	When a boolean of C<$coll-E<gt>ge($str, "abc")> and
	1238	C<$coll-E<gt>le($str, "abc\x{FFFF}")> is true, it is expected that C<$str>
	1239	begins with C<"abc">, or another primary equivalent.
	1240	C<$str> may be C<"abcd">, C<"abc012">, but should not include C<U+FFFF>
	1241	such as C<"abc\x{FFFF}xyz">.
	1242
	1243	C<$coll-E<gt>le($str, "abc\x{FFFF}")> works like C<$coll-E<gt>lt($str, "abd")>
	1244	almost, but the latter has a problem that you should know which letter is
	1245	next to C<c>. For a certain language where C<ch> is the next letter,
	1246	C<"abch"> is greater than C<"abc\x{FFFF}">, but less than C<"abd">.
	1247
	1248	Note:
	1249	This is equivalent to C<(entry =E<gt> 'FFFF ; [.FFFE.0020.0005.FFFF]')>.
	1250	Any other character than C<U+FFFF> can be tailored by C<entry>.
	1251
	1252	=item identical
	1253
	1254	-- see A.3 Deterministic Comparison, UTS #10.
	1255
	1256	By default, strings whose weights are equal should be equal,
	1257	even though their code points are not equal.
	1258	Completely ignorable characters are ignored.
	1259
	1260	If the parameter is made true, a final, tie-breaking level is used.
	1261	If no difference of weights is found after the comparison through
	1262	all the levels specified by C<level>, the comparison with code points
	1263	will be performed.
	1264	For the tie-breaking comparison, the sort key has code points
	1265	of the original string appended.
	1266	Completely ignorable characters are not ignored.
	1267
	1268	If C<preprocess> and/or C<normalization> is applied, the code points
	1269	of the string after them (in NFD by default) are used.
	1270
	1271	=item ignoreChar
	1272
	1273	=item ignoreName
	1274
	1275	-- see 3.6 Variable Weighting, UTS #10.
	1276
	1277	Makes the entry in the table completely ignorable;
	1278	i.e. as if the weights were zero at all levels.
	1279
	1280	Through C<ignoreChar>, any character matching C<qr/$ignoreChar/>
	1281	will be ignored. Through C<ignoreName>, any character whose name
	1282	(given in the C<table> file as a comment) matches C<qr/$ignoreName/>
	1283	will be ignored.
	1284
	1285	E.g. when 'a' and 'e' are ignorable,
	1286	'element' is equal to 'lament' (or 'lmnt').
	1287
	1288	=item ignore_level2
	1289
	1290	-- see 5.1 Parametric Tailoring, UTS #10.
	1291
	1292	By default, case-sensitive comparison (that is level 3 difference)
	1293	won't ignore accents (that is level 2 difference).
	1294
	1295	If the parameter is made true, accents (and other primary ignorable
	1296	characters) are ignored, even though cases are taken into account.
	1297
	1298	B<NOTE>: C<level> should be 3 or greater.
	1299
	1300	=item katakana_before_hiragana
	1301
	1302	-- see 7.2 Tertiary Weight Table, UTS #10.
	1303
	1304	By default, hiragana is before katakana.
	1305	If the parameter is made true, this is reversed.
	1306
	1307	B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana
	1308	distinctions must occur in level 3, and their weights at level 3 must be
	1309	same as those mentioned in 7.3.1, UTS #10.
	1310	If you define your collation elements which violate this requirement,
	1311	this parameter does not work validly.
	1312
	1313	=item level
	1314
	1315	-- see 4.3 Form Sort Key, UTS #10.
	1316
	1317	Set the maximum level.
	1318	Any higher levels than the specified one are ignored.
	1319
	1320	Level 1: alphabetic ordering
	1321	Level 2: diacritic ordering
	1322	Level 3: case ordering
	1323	Level 4: tie-breaking (e.g. in the case when variable is 'shifted')
	1324
	1325	ex.level => 2,
	1326
	1327	If omitted, the maximum is the 4th.
	1328
	1329	B<NOTE:> The DUCET includes weights over 0xFFFF at the 4th level.
	1330	But this module only uses weights within 0xFFFF.
	1331	When C<variable> is 'blanked' or 'non-ignorable' (other than 'shifted'
	1332	and 'shift-trimmed'), the level 4 may be unreliable.
	1333
	1334	See also C<identical>.
	1335
	1336	=item long_contraction
	1337
	1338	-- see 3.8.2 Well-Formedness of the DUCET, 4.2 Produce Array, UTS #10.
	1339
	1340	If the parameter is made true, for a contraction with three or more
	1341	characters (here nicknamed "long contraction"), initial substrings
	1342	will be handled.
	1343	For example, a contraction ABC, where A is a starter, and B and C
	1344	are non-starters (character with non-zero combining character class),
	1345	will be detected even if there is not AB as a contraction.
	1346
	1347	B<Default:> Usually false.
	1348	If C<UCA_Version> is 22 or 24, and the value of C<long_contraction>
	1349	is not specified in C<new()>, a true value is set implicitly.
	1350	This is a workaround to pass Conformance Tests for Unicode 6.0.0 and 6.1.0.
	1351
	1352	C<change()> handles C<long_contraction> explicitly only.
	1353	If C<long_contraction> is not specified in C<change()>, even though
	1354	C<UCA_Version> is changed, C<long_contraction> will not be changed.
	1355
	1356	B<Limitation:> Scanning non-starters is one-way (no back tracking).
	1357	If AB is found but ABC is not found, another long contraction where
	1358	the first character is A and the second is not B may not be found.
	1359
	1360	Under C<(normalization =E<gt> undef)>, detection step of discontiguous
	1361	contractions will be skipped.
	1362
	1363	B<Note:> The following contractions in DUCET are not considered
	1364	in steps S2.1.1 to S2.1.3, where they are discontiguous.
	1365
	1366	0FB2 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC RR)
	1367	0FB3 0F71 0F80 (TIBETAN VOWEL SIGN VOCALIC LL)
	1368
	1369	For example C<TIBETAN VOWEL SIGN VOCALIC RR> with C<COMBINING TILDE OVERLAY>
	1370	(C<U+0344>) is C<0FB2 0344 0F71 0F80> in NFD.
	1371	In this case C<0FB2 0F80> (C<TIBETAN VOWEL SIGN VOCALIC R>) is detected,
	1372	instead of C<0FB2 0F71 0F80>.
	1373	Inserted C<0344> makes C<0FB2 0F71 0F80> discontiguous and lack of
	1374	contraction C<0FB2 0F71> prohibits C<0FB2 0F71 0F80> from being detected.
	1375
	1376	=item minimalFFFE
	1377
	1378	-- see 1.1.1 U+FFFE, UTS #35 (LDML) Part 5: Collation.
	1379
	1380	If the parameter is made true, C<U+FFFE> has a minimal primary weight.
	1381	The comparison between C<"$a1\x{FFFE}$a2"> and C<"$b1\x{FFFE}$b2">
	1382	first compares C<$a1> and C<$b1> at level 1, and
	1383	then C<$a2> and C<$b2> at level 1, as follows.
	1384
	1385	"ab\x{FFFE}a"
	1386	"Ab\x{FFFE}a"
	1387	"ab\x{FFFE}c"
	1388	"Ab\x{FFFE}c"
	1389	"ab\x{FFFE}xyz"
	1390	"abc\x{FFFE}def"
	1391	"abc\x{FFFE}xYz"
	1392	"aBc\x{FFFE}xyz"
	1393	"abcX\x{FFFE}def"
	1394	"abcx\x{FFFE}xyz"
	1395	"b\x{FFFE}aaa"
	1396	"bbb\x{FFFE}a"
	1397
	1398	Note:
	1399	This is equivalent to C<(entry =E<gt> 'FFFE ; [.0001.0020.0005.FFFE]')>.
	1400	Any other character than C<U+FFFE> can be tailored by C<entry>.
	1401
	1402	=item normalization
	1403
	1404	-- see 4.1 Normalize, UTS #10.
	1405
	1406	If specified, strings are normalized before preparation of sort keys
	1407	(the normalization is executed after preprocess).
	1408
	1409	A form name C<Unicode::Normalize::normalize()> accepts will be applied
	1410	as C<$normalization_form>.
	1411	Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>.
	1412	See C<Unicode::Normalize::normalize()> for detail.
	1413	If omitted, C<'NFD'> is used.
	1414
	1415	C<normalization> is performed after C<preprocess> (if defined).
	1416
	1417	Furthermore, special values, C<undef> and C<"prenormalized">, can be used,
	1418	though they are not concerned with C<Unicode::Normalize::normalize()>.
	1419
	1420	If C<undef> (not a string C<"undef">) is passed explicitly
	1421	as the value for this key,
	1422	no normalization is carried out (this may make tailoring easier
	1423	if no normalization is desired). Under C<(normalization =E<gt> undef)>,
	1424	only contiguous contractions are resolved;
	1425	e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>,
	1426	C<A-cedilla-ring> would be primary equal to C<A>.
	1427	In this point,
	1428	C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })>
	1429	B<is not> equivalent to C<(normalization =E<gt> 'NFD')>.
	1430
	1431	In the case of C<(normalization =E<gt> "prenormalized")>,
	1432	no normalization is performed, but
	1433	discontiguous contractions with combining characters are still resolved.
	1434	Therefore
	1435	C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })>
	1436	B<is> equivalent to C<(normalization =E<gt> 'NFD')>.
	1437	If source strings are finely prenormalized,
	1438	C<(normalization =E<gt> 'prenormalized')> may save time for normalization.
	1439
	1440	Except C<(normalization =E<gt> undef)>,
	1441	B<Unicode::Normalize> is required (see also B<CAVEAT>).
	1442
	1443	=item overrideCJK
	1444
	1445	-- see 7.1 Derived Collation Elements, UTS #10.
	1446
	1447	By default, CJK unified ideographs are ordered in Unicode codepoint
	1448	order, but those in the CJK Unified Ideographs block are less than
	1449	those in the CJK Unified Ideographs Extension A etc.
	1450
	1451	In the CJK Unified Ideographs block:
	1452	U+4E00..U+9FA5 if UCA_Version is 8, 9 or 11.
	1453	U+4E00..U+9FBB if UCA_Version is 14 or 16.
	1454	U+4E00..U+9FC3 if UCA_Version is 18.
	1455	U+4E00..U+9FCB if UCA_Version is 20 or 22.
	1456	U+4E00..U+9FCC if UCA_Version is 24 to 30.
	1457	U+4E00..U+9FD5 if UCA_Version is 32.
	1458
	1459	In the CJK Unified Ideographs Extension blocks:
	1460	Ext.A (U+3400..U+4DB5) and Ext.B (U+20000..U+2A6D6) in any UCA_Version.
	1461	Ext.C (U+2A700..U+2B734) if UCA_Version is 20 or later.
	1462	Ext.D (U+2B740..U+2B81D) if UCA_Version is 22 or later.
	1463	Ext.E (U+2B820..U+2CEA1) if UCA_Version is 32.
	1464
	1465	Through C<overrideCJK>, ordering of CJK unified ideographs (including
	1466	extensions) can be overridden.
	1467
	1468	ex. CJK unified ideographs in the JIS code point order.
	1469
	1470	overrideCJK => sub {
	1471	my $u = shift; # get a Unicode codepoint
	1472	my $b = pack('n', $u); # to UTF-16BE
	1473	my $s = your_unicode_to_sjis_converter($b); # convert
	1474	my $n = unpack('n', $s); # convert sjis to short
	1475	[ $n, 0x20, 0x2, $u ]; # return the collation element
	1476	},
	1477
	1478	The return value may be an arrayref of 1st to 4th weights as shown
	1479	above. The return value may be an integer as the primary weight
	1480	as shown below. If C<undef> is returned, the default derived
	1481	collation element will be used.
	1482
	1483	overrideCJK => sub {
	1484	my $u = shift; # get a Unicode codepoint
	1485	my $b = pack('n', $u); # to UTF-16BE
	1486	my $s = your_unicode_to_sjis_converter($b); # convert
	1487	my $n = unpack('n', $s); # convert sjis to short
	1488	return $n; # return the primary weight
	1489	},
	1490
	1491	The return value may be a list containing zero or more of
	1492	an arrayref, an integer, or C<undef>.
	1493
	1494	ex. ignores all CJK unified ideographs.
	1495
	1496	overrideCJK => sub {()}, # CODEREF returning empty list
	1497
	1498	# where ->eq("Pe\x{4E00}rl", "Perl") is true
	1499	# as U+4E00 is a CJK unified ideograph and to be ignorable.
	1500
	1501	If a false value (including C<undef>) is passed, C<overrideCJK>
	1502	has no effect.
	1503	C<$Collator-E<gt>change(overrideCJK =E<gt> 0)> resets the old one.
	1504
	1505	But assignment of weight for CJK unified ideographs
	1506	in C<table> or C<entry> is still valid.
	1507	If C<undef> is passed explicitly as the value for this key,
	1508	weights for CJK unified ideographs are treated as undefined.
	1509	However when C<UCA_Version> E<gt> 8, C<(overrideCJK =E<gt> undef)>
	1510	has no special meaning.
	1511
	1512	B<Note:> In addition to them, 12 CJK compatibility ideographs (C<U+FA0E>,
	1513	C<U+FA0F>, C<U+FA11>, C<U+FA13>, C<U+FA14>, C<U+FA1F>, C<U+FA21>, C<U+FA23>,
	1514	C<U+FA24>, C<U+FA27>, C<U+FA28>, C<U+FA29>) are also treated as CJK unified
	1515	ideographs. But they can't be overridden via C<overrideCJK> when you use
	1516	DUCET, as the table includes weights for them. C<table> or C<entry> has
	1517	priority over C<overrideCJK>.
	1518
	1519	=item overrideHangul
	1520
	1521	-- see 7.1 Derived Collation Elements, UTS #10.
	1522
	1523	By default, Hangul syllables are decomposed into Hangul Jamo,
	1524	even if C<(normalization =E<gt> undef)>.
	1525	But the mapping of Hangul syllables may be overridden.
	1526
	1527	This parameter works like C<overrideCJK>, so see there for examples.
	1528
	1529	If you want to override the mapping of Hangul syllables,
	1530	NFD and NFKD are not appropriate, since NFD and NFKD will decompose
	1531	Hangul syllables before overriding. FCD may decompose Hangul syllables
	1532	as the case may be.
	1533
	1534	If a false value (but not C<undef>) is passed, C<overrideHangul>
	1535	has no effect.
	1536	C<$Collator-E<gt>change(overrideHangul =E<gt> 0)> resets the old one.
	1537
	1538	If C<undef> is passed explicitly as the value for this key,
	1539	weight for Hangul syllables is treated as undefined
	1540	without decomposition into Hangul Jamo.
	1541	But definition of weight for Hangul syllables
	1542	in C<table> or C<entry> is still valid.
	1543
	1544	=item overrideOut
	1545
	1546	-- see 7.1.1 Handling Ill-Formed Code Unit Sequences, UTS #10.
	1547
	1548	Perl seems to allow out-of-range values (greater than 0x10FFFF).
	1549	By default, out-of-range values are replaced with C<U+FFFD>
	1550	(REPLACEMENT CHARACTER) when C<UCA_Version> E<gt>= 22,
	1551	or ignored when C<UCA_Version> E<lt>= 20.
	1552
	1553	When C<UCA_Version> E<gt>= 22, the weights of out-of-range values
	1554	can be overridden. Though C<table> or C<entry> are available for them,
	1555	out-of-range values are too many.
	1556
	1557	C<overrideOut> can perform it algorithmically.
	1558	This parameter works like C<overrideCJK>, so see there for examples.
	1559
	1560	ex. ignores all out-of-range values.
	1561
	1562	overrideOut => sub {()}, # CODEREF returning empty list
	1563
	1564	If a false value (including C<undef>) is passed, C<overrideOut>
	1565	has no effect.
	1566	C<$Collator-E<gt>change(overrideOut =E<gt> 0)> resets the old one.
	1567
	1568	B<NOTE ABOUT U+FFFD:>
	1569
	1570	UCA recommends that out-of-range values should not be ignored for security
	1571	reasons. Say, C<"pe\x{110000}rl"> should not be equal to C<"perl">.
	1572	However, C<U+FFFD> is wrongly mapped to a variable collation element
	1573	in DUCET for Unicode 6.0.0 to 6.2.0, that means out-of-range values will be
	1574	ignored when C<variable> isn't C<Non-ignorable>.
	1575
	1576	The mapping of C<U+FFFD> is corrected in Unicode 6.3.0.
	1577	see L<http://www.unicode.org/reports/tr10/tr10-28.html#Trailing_Weights>
	1578	(7.1.4 Trailing Weights). Such a correction is reproduced by this.
	1579
	1580	overrideOut => sub { 0xFFFD }, # CODEREF returning a very large integer
	1581
	1582	This workaround is unnecessary since Unicode 6.3.0.
	1583
	1584	=item preprocess
	1585
	1586	-- see 5.4 Preprocessing, UTS #10.
	1587
	1588	If specified, the coderef is used to preprocess each string
	1589	before the formation of sort keys.
	1590
	1591	ex. dropping English articles, such as "a" or "the".
	1592	Then, "the pen" is before "a pencil".
	1593
	1594	preprocess => sub {
	1595	my $str = shift;
	1596	$str =~ s/\b(?:an?|the)\s+//gi;
	1597	return $str;
	1598	},
	1599
	1600	C<preprocess> is performed before C<normalization> (if defined).
	1601
	1602	ex. decoding strings in a legacy encoding such as shift-jis:
	1603
	1604	$sjis_collator = Unicode::Collate->new(
	1605	preprocess => \&your_shiftjis_to_unicode_decoder,
	1606	);
	1607	@result = $sjis_collator->sort(@shiftjis_strings);
	1608
	1609	B<Note:> Strings returned from the coderef will be interpreted
	1610	according to Perl's Unicode support. See L<perlunicode>,
	1611	L<perluniintro>, L<perlunitut>, L<perlunifaq>, L<utf8>.
	1612
	1613	=item rearrange
	1614
	1615	-- see 3.5 Rearrangement, UTS #10.
	1616
	1617	Characters that are not coded in logical order and to be rearranged.
	1618	If C<UCA_Version> is equal to or less than 11, default is:
	1619
	1620	rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
	1621
	1622	If you want to disallow any rearrangement, pass C<undef> or C<[]>
	1623	(a reference to empty list) as the value for this key.
	1624
	1625	If C<UCA_Version> is equal to or greater than 14, default is C<[]>
	1626	(i.e. no rearrangement).
	1627
	1628	B<According to the version 9 of UCA, this parameter shall not be used;
	1629	but it is not warned at present.>
	1630
	1631	=item rewrite
	1632
	1633	If specified, the coderef is used to rewrite lines in C<table> or C<entry>.
	1634	The coderef will get each line, and then should return a rewritten line
	1635	according to the UCA file format.
	1636	If the coderef returns an empty line, the line will be skipped.
	1637
	1638	e.g. any primary ignorable characters into tertiary ignorable:
	1639
	1640	rewrite => sub {
	1641	my $line = shift;
	1642	$line =~ s/\[\.0000\..{4}\..{4}\./[.0000.0000.0000./g;
	1643	return $line;
	1644	},
	1645
	1646	This example shows rewriting weights. C<rewrite> is allowed to
	1647	affect code points, weights, and the name.
	1648
	1649	B<NOTE>: C<table> is available to use another table file;
	1650	preparing a modified table once would be more efficient than
	1651	rewriting lines on reading an unmodified table every time.
	1652
	1653	=item suppress
	1654
	1655	-- see 3.12 Special-Purpose Commands, UTS #35 (LDML) Part 5: Collation.
	1656
	1657	Contractions beginning with the specified characters are suppressed,
	1658	even if those contractions are defined in C<table>.
	1659
	1660	An example for Russian and some languages using the Cyrillic script:
	1661
	1662	suppress => [0x0400..0x0417, 0x041A..0x0437, 0x043A..0x045F],
	1663
	1664	where 0x0400 stands for C<U+0400>, CYRILLIC CAPITAL LETTER IE WITH GRAVE.
	1665
	1666	B<NOTE>: Contractions via C<entry> will not be suppressed.
	1667
	1668	=item table
	1669
	1670	-- see 3.8 Default Unicode Collation Element Table, UTS #10.
	1671
	1672	You can use another collation element table if desired.
	1673
	1674	The table file should be located in the F<Unicode/Collate> directory
	1675	on C<@INC>. Say, if the filename is F<Foo.txt>,
	1676	the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>.
	1677
	1678	By default, F<allkeys.txt> (as the filename of DUCET) is used.
	1679	If you will prepare your own table file, any name other than F<allkeys.txt>
	1680	may be better to avoid namespace conflict.
	1681
	1682	B<NOTE>: When XSUB is used, the DUCET is compiled on building this
	1683	module, and it may save time at the run time.
	1684	Explicitly saying C<(table =E<gt> 'allkeys.txt')>, or using another table,
	1685	or using C<ignoreChar>, C<ignoreName>, C<undefChar>, C<undefName> or
	1686	C<rewrite> will prevent this module from using the compiled DUCET.
	1687
	1688	If C<undef> is passed explicitly as the value for this key,
	1689	no file is read (but you can define collation elements via C<entry>).
	1690
	1691	A typical way to define a collation element table
	1692	without any file of table:
	1693
	1694	$onlyABC = Unicode::Collate->new(
	1695	table => undef,
	1696	entry => << 'ENTRIES',
	1697	0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A
	1698	0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A
	1699	0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B
	1700	0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B
	1701	0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C
	1702	0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C
	1703	ENTRIES
	1704	);
	1705
	1706	If C<ignoreName> or C<undefName> is used, character names should be
	1707	specified as a comment (following C<#>) on each line.
	1708
	1709	=item undefChar
	1710
	1711	=item undefName
	1712
	1713	-- see 6.3.3 Reducing the Repertoire, UTS #10.
	1714
	1715	Undefines the collation element as if it were unassigned in the C<table>.
	1716	This reduces the size of the table.
	1717	If an unassigned character appears in the string to be collated,
	1718	the sort key is made from its codepoint
	1719	as a single-character collation element,
	1720	as it is greater than any other assigned collation elements
	1721	(in the codepoint order among the unassigned characters).
	1722	But, it'd be better to ignore characters
	1723	unfamiliar to you and maybe never used.
	1724
	1725	Through C<undefChar>, any character matching C<qr/$undefChar/>
	1726	will be undefined. Through C<undefName>, any character whose name
	1727	(given in the C<table> file as a comment) matches C<qr/$undefName/>
	1728	will be undefined.
	1729
	1730	ex. Collation weights for beyond-BMP characters are not stored in object:
	1731
	1732	undefChar => qr/[^\0-\x{fffd}]/,
	1733
	1734	=item upper_before_lower
	1735
	1736	-- see 6.6 Case Comparisons, UTS #10.
	1737
	1738	By default, lowercase is before uppercase.
	1739	If the parameter is made true, this is reversed.
	1740
	1741	B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase
	1742	distinctions must occur in level 3, and their weights at level 3 must be
	1743	same as those mentioned in 7.3.1, UTS #10.
	1744	If you define your collation elements which differ from this requirement,
	1745	this parameter doesn't work validly.
	1746
	1747	=item variable
	1748
	1749	-- see 3.6 Variable Weighting, UTS #10.
	1750
	1751	This key allows for variable weighting of variable collation elements,
	1752	which are marked with an ASTERISK in the table
	1753	(NOTE: Many punctuation marks and symbols are variable in F<allkeys.txt>).
	1754
	1755	variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'.
	1756
	1757	These names are case-insensitive.
	1758	By default (if specification is omitted), 'shifted' is adopted.
	1759
	1760	'Blanked' Variable elements are made ignorable at levels 1 through 3;
	1761	considered at the 4th level.
	1762
	1763	'Non-Ignorable' Variable elements are not reset to ignorable.
	1764
	1765	'Shifted' Variable elements are made ignorable at levels 1 through 3;
	1766	their level 4 weight is replaced by the old level 1 weight.
	1767	Level 4 weight for Non-Variable elements is 0xFFFF.
	1768
	1769	'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level
	1770	are trimmed.
	1771
	1772	=back
	1773
	1774	=head2 Methods for Collation
	1775
	1776	=over 4
	1777
	1778	=item C<@sorted = $Collator-E<gt>sort(@not_sorted)>
	1779
	1780	Sorts a list of strings.
	1781
	1782	=item C<$result = $Collator-E<gt>cmp($a, $b)>
	1783
	1784	Returns 1 (when C<$a> is greater than C<$b>)
	1785	or 0 (when C<$a> is equal to C<$b>)
	1786	or -1 (when C<$a> is less than C<$b>).
	1787
	1788	=item C<$result = $Collator-E<gt>eq($a, $b)>
	1789
	1790	=item C<$result = $Collator-E<gt>ne($a, $b)>
	1791
	1792	=item C<$result = $Collator-E<gt>lt($a, $b)>
	1793
	1794	=item C<$result = $Collator-E<gt>le($a, $b)>
	1795
	1796	=item C<$result = $Collator-E<gt>gt($a, $b)>
	1797
	1798	=item C<$result = $Collator-E<gt>ge($a, $b)>
	1799
	1800	They work like the operators of the same names.
	1801
	1802	eq : whether $a is equal to $b.
	1803	ne : whether $a is not equal to $b.
	1804	lt : whether $a is less than $b.
	1805	le : whether $a is less than $b or equal to $b.
	1806	gt : whether $a is greater than $b.
	1807	ge : whether $a is greater than $b or equal to $b.
	1808
	1809	=item C<$sortKey = $Collator-E<gt>getSortKey($string)>
	1810
	1811	-- see 4.3 Form Sort Key, UTS #10.
	1812
	1813	Returns a sort key.
	1814
	1815	You compare the sort keys using a binary comparison
	1816	and get the result of the comparison of the strings using UCA.
	1817
	1818	$Collator->getSortKey($a) cmp $Collator->getSortKey($b)
	1819
	1820	is equivalent to
	1821
	1822	$Collator->cmp($a, $b)
	1823
	1824	=item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)>
	1825
	1826	Converts a sorting key into its representation form.
	1827	If C<UCA_Version> is 8, the output is slightly different.
	1828
	1829	use Unicode::Collate;
	1830	my $c = Unicode::Collate->new();
	1831	print $c->viewSortKey("Perl"),"\n";
	1832
	1833	# output:
	1834	# [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF]
	1835	# Level 1 Level 2 Level 3 Level 4
	1836
	1837	=back
	1838
	1839	=head2 Methods for Searching
	1840
	1841	The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work
	1842	like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively,
	1843	but they do not interpret patterns; they match only a literal substring.
	1844
	1845	B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true
	1846	for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>,
	1847	C<subst>, C<gsubst>) will croak, as the position and the length might
	1848	differ from those on the specified string.
	1849
	1850	C<rearrange> and C<hangul_terminator> parameters are neglected.
	1851	C<katakana_before_hiragana> and C<upper_before_lower> don't affect
	1852	matching and searching, as it doesn't matter whether greater or less.
	1853
	1854	=over 4
	1855
	1856	=item C<$position = $Collator-E<gt>index($string, $substring[, $position])>
	1857
	1858	=item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])>
	1859
	1860	If C<$substring> matches a part of C<$string>, returns
	1861	the position of the first occurrence of the matching part in scalar context;
	1862	in list context, returns a two-element list of
	1863	the position and the length of the matching part.
	1864
	1865	If C<$substring> does not match any part of C<$string>,
	1866	returns C<-1> in scalar context and
	1867	an empty list in list context.
	1868
	1869	e.g. when the content of C<$str> is C<"Ich mu>E<szlig>C< studieren Perl.">,
	1870	you say the following where C<$sub> is C<"M>E<uuml>C<SS">,
	1871
	1872	my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
	1873	# (normalization => undef) is REQUIRED.
	1874	my $match;
	1875	if (my($pos,$len) = $Collator->index($str, $sub)) {
	1876	$match = substr($str, $pos, $len);
	1877	}
	1878
	1879	and get C<"mu>E<szlig>C<"> in C<$match>, since C<"mu>E<szlig>C<">
	1880	is primary equal to C<"M>E<uuml>C<SS">.
	1881
	1882	=item C<$match_ref = $Collator-E<gt>match($string, $substring)>
	1883
	1884	=item C<($match) = $Collator-E<gt>match($string, $substring)>
	1885
	1886	If C<$substring> matches a part of C<$string>, in scalar context, returns
	1887	B<a reference to> the first occurrence of the matching part
	1888	(C<$match_ref> is always true if matches,
	1889	since every reference is B<true>);
	1890	in list context, returns the first occurrence of the matching part.
	1891
	1892	If C<$substring> does not match any part of C<$string>,
	1893	returns C<undef> in scalar context and
	1894	an empty list in list context.
	1895
	1896	e.g.
	1897
	1898	if ($match_ref = $Collator->match($str, $sub)) { # scalar context
	1899	print "matches [$$match_ref].\n";
	1900	} else {
	1901	print "doesn't match.\n";
	1902	}
	1903
	1904	or
	1905
	1906	if (($match) = $Collator->match($str, $sub)) { # list context
	1907	print "matches [$match].\n";
	1908	} else {
	1909	print "doesn't match.\n";
	1910	}
	1911
	1912	=item C<@match = $Collator-E<gt>gmatch($string, $substring)>
	1913
	1914	If C<$substring> matches a part of C<$string>, returns
	1915	all the matching parts (or matching count in scalar context).
	1916
	1917	If C<$substring> does not match any part of C<$string>,
	1918	returns an empty list.
	1919
	1920	=item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)>
	1921
	1922	If C<$substring> matches a part of C<$string>,
	1923	the first occurrence of the matching part is replaced by C<$replacement>
	1924	(C<$string> is modified) and C<$count> (always equal to C<1>) is returned.
	1925
	1926	C<$replacement> can be a C<CODEREF>,
	1927	taking the matching part as an argument,
	1928	and returning a string to replace the matching part
	1929	(a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>).
	1930
	1931	=item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)>
	1932
	1933	If C<$substring> matches a part of C<$string>,
	1934	all the occurrences of the matching part are replaced by C<$replacement>
	1935	(C<$string> is modified) and C<$count> is returned.
	1936
	1937	C<$replacement> can be a C<CODEREF>,
	1938	taking the matching part as an argument,
	1939	and returning a string to replace the matching part
	1940	(a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>).
	1941
	1942	e.g.
	1943
	1944	my $Collator = Unicode::Collate->new( normalization => undef, level => 1 );
	1945	# (normalization => undef) is REQUIRED.
	1946	my $str = "Camel donkey zebra came\x{301}l CAMEL horse cam\0e\0l...";
	1947	$Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" });
	1948
	1949	# now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cam\0e\0l</b>...";
	1950	# i.e., all the camels are made bold-faced.
	1951
	1952	Examples: levels and ignore_level2 - what does camel match?
	1953	---------------------------------------------------------------------------
	1954	 level  ignore_level2  |  camel  Camel  came\x{301}l  c-a-m-e-l  cam\0e\0l
	1955	-----------------------|---------------------------------------------------
	1956	   1        false      |   yes    yes      yes          yes        yes
	1957	   2        false      |   yes    yes      no           yes        yes
	1958	   3        false      |   yes    no       no           yes        yes
	1959	   4        false      |   yes    no       no           no         yes
	1960	-----------------------|---------------------------------------------------
	1961	   1        true       |   yes    yes      yes          yes        yes
	1962	   2        true       |   yes    yes      yes          yes        yes
	1963	   3        true       |   yes    no       yes          yes        yes
	1964	   4        true       |   yes    no       yes          no         yes
	1965	---------------------------------------------------------------------------
	1966	note: if variable => non-ignorable, camel doesn't match c-a-m-e-l
	1967	at any level.
	1968
	1969	=back
	1970
	1971	=head2 Other Methods
	1972
	1973	=over 4
	1974
	1975	=item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)>
	1976
	1977	=item C<$modified_collator = $Collator-E<gt>change(%new_tailoring)>
	1978
	1979	Changes the values of the specified keys and, in list context,
	1979	returns the previous C<(key,value)> pairs of the changed keys.
	1980
	1981	$Collator = Unicode::Collate->new(level => 4);
	1982
	1983	$Collator->eq("perl", "PERL"); # false
	1984
	1985	%old = $Collator->change(level => 2); # returns (level => 4).
	1986
	1987	$Collator->eq("perl", "PERL"); # true
	1988
	1989	$Collator->change(%old); # returns (level => 2).
	1990
	1991	$Collator->eq("perl", "PERL"); # false
	1992
	1993	Not all C<(key,value)>s are allowed to be changed.
	1994	See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>.
	1995
	1996	In scalar context, returns the modified collator itself
	1997	(the same object, B<not> a clone of the original).
	1998
	1999	$Collator->change(level => 2)->eq("perl", "PERL"); # true
	2000
	2001	$Collator->eq("perl", "PERL"); # true; now max level is 2nd.
	2002
	2003	$Collator->change(level => 4)->eq("perl", "PERL"); # false
	2004
	2005	=item C<$version = $Collator-E<gt>version()>
	2006
	2007	Returns the version number (a string) of the Unicode Standard
	2008	which the C<table> file used by the collator object is based on.
	2009	If the table does not include a version line (starting with C<@version>),
	2010	returns C<"unknown">.
	2011
	2012	=item C<UCA_Version()>
	2013
	2014	Returns the revision number of UTS #10 this module consults,
	2015	which should correspond to the DUCET incorporated.
	2016
	2017	=item C<Base_Unicode_Version()>
	2018
	2019	Returns the version number of UTS #10 this module consults,
	2020	which should correspond to the DUCET incorporated.
	2021
	2022	=back
	2023
	2024	=head1 EXPORT
	2025
	2026	No method will be exported.
	2027
	2028	=head1 INSTALL
	2029
	2030	Though this module can be used without any C<table> file,
	2031	to use this module easily, it is recommended to install a table file
	2032	in the UCA format, by copying it under the directory
	2033	<a place in @INC>/Unicode/Collate.
	2034
	2035	The most preferable one is "The Default Unicode Collation Element Table"
	2036	(aka DUCET), available from the Unicode Consortium's website:
	2037
	2038	http://www.unicode.org/Public/UCA/
	2039
	2040	http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version)
	2041
	2042	If DUCET is not installed, it is recommended to copy the file
	2043	from http://www.unicode.org/Public/UCA/latest/allkeys.txt
	2044	to <a place in @INC>/Unicode/Collate/allkeys.txt
	2045	manually.
	2046
	2047	=head1 CAVEATS
	2048
	2049	=over 4
	2050
	2051	=item Normalization
	2052
	2053	Use of the C<normalization> parameter requires the B<Unicode::Normalize>
	2054	module (see L<Unicode::Normalize>).
	2055
	2056	If you do not need it (say, when you do not need to
	2057	handle any combining characters),
	2058	assign C<(normalization =E<gt> undef)> explicitly.
	2059
	2060	-- see 6.5 Avoiding Normalization, UTS #10.
	2061
	2062	=item Conformance Test
	2063
	2064	The Conformance Test for the UCA is available
	2065	under L<http://www.unicode.org/Public/UCA/>.
	2066
	2067	For F<CollationTest_SHIFTED.txt>,
	2068	a collator via C<Unicode::Collate-E<gt>new( )> should be used;
	2069	for F<CollationTest_NON_IGNORABLE.txt>, a collator via
	2070	C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>.
	2071
	2072	If C<UCA_Version> is 26 or later, the C<identical> level is preferred;
	2073	C<Unicode::Collate-E<gt>new(identical =E<gt> 1)> and
	2074	C<Unicode::Collate-E<gt>new(identical =E<gt> 1,>
	2075	C<variable =E<gt> "non-ignorable", level =E<gt> 3)> should be used.
	2076
	2077	B<Unicode::Normalize is required to try The Conformance Test.>
	2078
	2079	=back
	2080
	2081	=head1 AUTHOR, COPYRIGHT AND LICENSE
	2082
	2083	The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki,
	2084	<SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2016,
	2085	SADAHIRO Tomoyuki. Japan. All rights reserved.
	2086
	2087	This module is free software; you can redistribute it and/or
	2088	modify it under the same terms as Perl itself.
	2089
	2090	The file Unicode/Collate/allkeys.txt was copied verbatim
	2091	from L<http://www.unicode.org/Public/UCA/8.0.0/allkeys.txt>.
	2092	For this file, Copyright (c) 2001-2015 Unicode, Inc.; distributed
	2093	under the Terms of Use in L<http://www.unicode.org/terms_of_use.html>
	2094
	2095	=head1 SEE ALSO
	2096
	2097	=over 4
	2098
	2099	=item Unicode Collation Algorithm - UTS #10
	2100
	2101	L<http://www.unicode.org/reports/tr10/>
	2102
	2103	=item The Default Unicode Collation Element Table (DUCET)
	2104
	2105	L<http://www.unicode.org/Public/UCA/latest/allkeys.txt>
	2106
	2107	=item The conformance test for the UCA
	2108
	2109	L<http://www.unicode.org/Public/UCA/latest/CollationTest.html>
	2110
	2111	L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip>
	2112
	2113	=item Hangul Syllable Type
	2114
	2115	L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt>
	2116
	2117	=item Unicode Normalization Forms - UAX #15
	2118
	2119	L<http://www.unicode.org/reports/tr15/>
	2120
	2121	=item Unicode Locale Data Markup Language (LDML) - UTS #35
	2122
	2123	L<http://www.unicode.org/reports/tr35/>
	2124
	2125	=back
	2126
	2127	=cut