perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	package Unicode::UCD;
	2
	3	use strict;
	4	use warnings;
	5	no warnings 'surrogate'; # surrogates can be inputs to this
	6	use charnames ();
	7
	8	our $VERSION = '0.53';
	9
	10	require Exporter;
	11
	12	our @ISA = qw(Exporter);
	13
	14	our @EXPORT_OK = qw(charinfo
	15	charblock charscript
	16	charblocks charscripts
	17	charinrange
	18	general_categories bidi_types
	19	compexcl
	20	casefold all_casefolds casespec
	21	namedseq
	22	num
	23	prop_aliases
	24	prop_value_aliases
	25	prop_invlist
	26	prop_invmap
	27	search_invlist
	28	MAX_CP
	29	);
	30
	31	use Carp;
	32
	33	=head1 NAME
	34
	35	Unicode::UCD - Unicode character database
	36
	37	=head1 SYNOPSIS
	38
	39	use Unicode::UCD 'charinfo';
	40	my $charinfo = charinfo($codepoint);
	41
	42	use Unicode::UCD 'casefold';
	43	my $casefold = casefold(0xFB00);
	44
	45	use Unicode::UCD 'all_casefolds';
	46	my $all_casefolds_ref = all_casefolds();
	47
	48	use Unicode::UCD 'casespec';
	49	my $casespec = casespec(0xFB00);
	50
	51	use Unicode::UCD 'charblock';
	52	my $charblock = charblock($codepoint);
	53
	54	use Unicode::UCD 'charscript';
	55	my $charscript = charscript($codepoint);
	56
	57	use Unicode::UCD 'charblocks';
	58	my $charblocks = charblocks();
	59
	60	use Unicode::UCD 'charscripts';
	61	my $charscripts = charscripts();
	62
	63	use Unicode::UCD qw(charscript charinrange);
	64	my $range = charscript($script);
	65	print "looks like $script\n" if charinrange($range, $codepoint);
	66
	67	use Unicode::UCD qw(general_categories bidi_types);
	68	my $categories = general_categories();
	69	my $types = bidi_types();
	70
	71	use Unicode::UCD 'prop_aliases';
	72	my @space_names = prop_aliases("space");
	73
	74	use Unicode::UCD 'prop_value_aliases';
	75	my @gc_punct_names = prop_value_aliases("Gc", "Punct");
	76
	77	use Unicode::UCD 'prop_invlist';
	78	my @puncts = prop_invlist("gc=punctuation");
	79
	80	use Unicode::UCD 'prop_invmap';
	81	my ($list_ref, $map_ref, $format, $missing)
	82	= prop_invmap("General Category");
	83
	84	use Unicode::UCD 'search_invlist';
	85	my $index = search_invlist(\@invlist, $code_point);
	86
	87	use Unicode::UCD 'compexcl';
	88	my $compexcl = compexcl($codepoint);
	89
	90	use Unicode::UCD 'namedseq';
	91	my $namedseq = namedseq($named_sequence_name);
	92
	93	my $unicode_version = Unicode::UCD::UnicodeVersion();
	94
	95	my $convert_to_numeric =
	96	Unicode::UCD::num("\N{RUMI DIGIT ONE}\N{RUMI DIGIT TWO}");
	97
	98	=head1 DESCRIPTION
	99
	100	The Unicode::UCD module offers a series of functions that
	101	provide a simple interface to the Unicode
	102	Character Database.
	103
	104	=head2 code point argument
	105
	106	Some of the functions are called with a I<code point argument>, which is either
	107	a decimal or a hexadecimal scalar designating a Unicode code point, or C<U+>
	108	followed by hexadecimals designating a Unicode code point. In other words, if
	109	you want a code point to be interpreted as a hexadecimal number, you must
	110	prefix it with either C<0x> or C<U+>, because a string like e.g. C<123> will be
	111	interpreted as a decimal code point.
	112
	113	Examples:
	114
	115	223 # Decimal 223
	116	0223 # Hexadecimal 223 (= 547 decimal)
	117	0xDF # Hexadecimal DF (= 223 decimal
	118	U+DF # Hexadecimal DF
	119
	120	Note that the largest code point in Unicode is U+10FFFF.
	121
	122	=cut
	123
	124	my $BLOCKSFH;
	125	my $VERSIONFH;
	126	my $CASEFOLDFH;
	127	my $CASESPECFH;
	128	my $NAMEDSEQFH;
	129	my $v_unicode_version; # v-string.
	130
	131	sub openunicode {
	132	my ($rfh, @path) = @_;
	133	my $f;
	134	unless (defined $$rfh) {
	135	for my $d (@INC) {
	136	use File::Spec;
	137	$f = File::Spec->catfile($d, "unicore", @path);
	138	last if open($$rfh, $f);
	139	undef $f;
	140	}
	141	croak __PACKAGE__, ": failed to find ",
	142	File::Spec->catfile(@path), " in @INC"
	143	unless defined $f;
	144	}
	145	return $f;
	146	}
	147
	148	sub _dclone ($) { # Use Storable::dclone if available; otherwise emulate it.
	149
	150	use if defined &DynaLoader::boot_DynaLoader, Storable => qw(dclone);
	151
	152	return dclone(shift) if defined &dclone;
	153
	154	my $arg = shift;
	155	my $type = ref $arg;
	156	return $arg unless $type; # No deep cloning needed for scalars
	157
	158	if ($type eq 'ARRAY') {
	159	my @return;
	160	foreach my $element (@$arg) {
	161	push @return, &_dclone($element);
	162	}
	163	return \@return;
	164	}
	165	elsif ($type eq 'HASH') {
	166	my %return;
	167	foreach my $key (keys %$arg) {
	168	$return{$key} = &_dclone($arg->{$key});
	169	}
	170	return \%return;
	171	}
	172	else {
	173	croak "_dclone can't handle " . $type;
	174	}
	175	}
	176
	177	=head2 B<charinfo()>
	178
	179	use Unicode::UCD 'charinfo';
	180
	181	my $charinfo = charinfo(0x41);
	182
	183	This returns information about the input L</code point argument>
	184	as a reference to a hash of fields as defined by the Unicode
	185	standard. If the L</code point argument> is not assigned in the standard
	186	(i.e., has the general category C<Cn> meaning C<Unassigned>)
	187	or is a non-character (meaning it is guaranteed to never be assigned in
	188	the standard),
	189	C<undef> is returned.
	190
	191	Fields that aren't applicable to the particular code point argument exist in the
	192	returned hash, and are empty.
	193
	194	The keys in the hash with the meanings of their values are:
	195
	196	=over
	197
	198	=item B<code>
	199
	200	the input L</code point argument> expressed in hexadecimal, with leading zeros
	201	added if necessary to make it contain at least four hexdigits
	202
	203	=item B<name>
	204
	205	name of I<code>, all IN UPPER CASE.
	206	Some control-type code points do not have names.
	207	This field will be empty for C<Surrogate> and C<Private Use> code points,
	208	and for the others without a name,
	209	it will contain a description enclosed in angle brackets, like
	210	C<E<lt>controlE<gt>>.
	211
	212
	213	=item B<category>
	214
	215	The short name of the general category of I<code>.
	216	This will match one of the keys in the hash returned by L</general_categories()>.
	217
	218	The L</prop_value_aliases()> function can be used to get all the synonyms
	219	of the category name.
	220
	221	=item B<combining>
	222
	223	the combining class number for I<code> used in the Canonical Ordering Algorithm.
	224	For Unicode 5.1, this is described in Section 3.11 C<Canonical Ordering Behavior>
	225	available at
	226	L<http://www.unicode.org/versions/Unicode5.1.0/>
	227
	228	The L</prop_value_aliases()> function can be used to get all the synonyms
	229	of the combining class number.
	230
	231	=item B<bidi>
	232
	233	bidirectional type of I<code>.
	234	This will match one of the keys in the hash returned by L</bidi_types()>.
	235
	236	The L</prop_value_aliases()> function can be used to get all the synonyms
	237	of the bidi type name.
	238
	239	=item B<decomposition>
	240
	241	is empty if I<code> has no decomposition; or is one or more codes
	242	(separated by spaces) that, taken in order, represent a decomposition for
	243	I<code>. Each has at least four hexdigits.
	244	The codes may be preceded by a word enclosed in angle brackets then a space,
	245	like C<E<lt>compatE<gt> >, giving the type of decomposition
	246
	247	This decomposition may be an intermediate one whose components are also
	248	decomposable. Use L<Unicode::Normalize> to get the final decomposition.
	249
	250	=item B<decimal>
	251
	252	if I<code> is a decimal digit this is its integer numeric value
	253
	254	=item B<digit>
	255
	256	if I<code> represents some other digit-like number, this is its integer
	257	numeric value
	258
	259	=item B<numeric>
	260
	261	if I<code> represents a whole or rational number, this is its numeric value.
	262	Rational values are expressed as a string like C<1/4>.
	263
	264	=item B<mirrored>
	265
	266	C<Y> or C<N> designating if I<code> is mirrored in bidirectional text
	267
	268	=item B<unicode10>
	269
	270	name of I<code> in the Unicode 1.0 standard if one
	271	existed for this code point and is different from the current name
	272
	273	=item B<comment>
	274
	275	As of Unicode 6.0, this is always empty.
	276
	277	=item B<upper>
	278
	279	is empty if there is no single code point uppercase mapping for I<code>
	280	(its uppercase mapping is itself);
	281	otherwise it is that mapping expressed as at least four hexdigits.
	282	(L</casespec()> should be used in addition to B<charinfo()>
	283	for case mappings when the calling program can cope with multiple code point
	284	mappings.)
	285
	286	=item B<lower>
	287
	288	is empty if there is no single code point lowercase mapping for I<code>
	289	(its lowercase mapping is itself);
	290	otherwise it is that mapping expressed as at least four hexdigits.
	291	(L</casespec()> should be used in addition to B<charinfo()>
	292	for case mappings when the calling program can cope with multiple code point
	293	mappings.)
	294
	295	=item B<title>
	296
	297	is empty if there is no single code point titlecase mapping for I<code>
	298	(its titlecase mapping is itself);
	299	otherwise it is that mapping expressed as at least four hexdigits.
	300	(L</casespec()> should be used in addition to B<charinfo()>
	301	for case mappings when the calling program can cope with multiple code point
	302	mappings.)
	303
	304	=item B<block>
	305
	306	the block I<code> belongs to (used in C<\p{Blk=...}>).
	307	See L</Blocks versus Scripts>.
	308
	309
	310	=item B<script>
	311
	312	the script I<code> belongs to.
	313	See L</Blocks versus Scripts>.
	314
	315	=back
	316
	317	Note that you cannot do (de)composition and casing based solely on the
	318	I<decomposition>, I<combining>, I<lower>, I<upper>, and I<title> fields;
	319	you will need also the L</compexcl()>, and L</casespec()> functions.
	320
	321	=cut
	322
	323	# NB: This function is nearly duplicated in charnames.pm
	324	sub _getcode {
	325	my $arg = shift;
	326
	327	if ($arg =~ /^[1-9]\d*$/) {
	328	return $arg;
	329	} elsif ($arg =~ /^(?:[Uu]\+\|0[xX])?([[:xdigit:]]+)$/) {
	330	return hex($1);
	331	}
	332
	333	return;
	334	}
	335
	336	# Populated by _num. Converts real number back to input rational
	337	my %real_to_rational;
	338
	339	# To store the contents of files found on disk.
	340	my @BIDIS;
	341	my @CATEGORIES;
	342	my @DECOMPOSITIONS;
	343	my @NUMERIC_TYPES;
	344	my %SIMPLE_LOWER;
	345	my %SIMPLE_TITLE;
	346	my %SIMPLE_UPPER;
	347	my %UNICODE_1_NAMES;
	348	my %ISO_COMMENT;
	349
	350	sub charinfo {
	351
	352	# This function has traditionally mimicked what is in UnicodeData.txt,
	353	# warts and all. This is a re-write that avoids UnicodeData.txt so that
	354	# it can be removed to save disk space. Instead, this assembles
	355	# information gotten by other methods that get data from various other
	356	# files. It uses charnames to get the character name; and various
	357	# mktables tables.
	358
	359	use feature 'unicode_strings';
	360
	361	# Will fail if called under minitest
	362	use if defined &DynaLoader::boot_DynaLoader, "Unicode::Normalize" => qw(getCombinClass NFD);
	363
	364	my $arg = shift;
	365	my $code = _getcode($arg);
	366	croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code;
	367
	368	# Non-unicode implies undef.
	369	return if $code > 0x10FFFF;
	370
	371	my %prop;
	372	my $char = chr($code);
	373
	374	@CATEGORIES =_read_table("To/Gc.pl") unless @CATEGORIES;
	375	$prop{'category'} = _search(\@CATEGORIES, 0, $#CATEGORIES, $code)
	376	// $utf8::SwashInfo{'ToGc'}{'missing'};
	377
	378	return if $prop{'category'} eq 'Cn'; # Unassigned code points are undef
	379
	380	$prop{'code'} = sprintf "%04X", $code;
	381	$prop{'name'} = ($char =~ /\p{Cntrl}/) ? '<control>'
	382	: (charnames::viacode($code) // "");
	383
	384	$prop{'combining'} = getCombinClass($code);
	385
	386	@BIDIS =_read_table("To/Bc.pl") unless @BIDIS;
	387	$prop{'bidi'} = _search(\@BIDIS, 0, $#BIDIS, $code)
	388	// $utf8::SwashInfo{'ToBc'}{'missing'};
	389
	390	# For most code points, we can just read in "unicore/Decomposition.pl", as
	391	# its contents are exactly what should be output. But that file doesn't
	392	# contain the data for the Hangul syllable decompositions, which can be
	393	# algorithmically computed, and NFD() does that, so we call NFD() for
	394	# those. We can't use NFD() for everything, as it does a complete
	395	# recursive decomposition, and what this function has always done is to
	396	# return what's in UnicodeData.txt which doesn't show that recursiveness.
	397	# Fortunately, the NFD() of the Hanguls doesn't have any recursion
	398	# issues.
	399	# Having no decomposition implies an empty field; otherwise, all but
	400	# "Canonical" imply a compatible decomposition, and the type is prefixed
	401	# to that, as it is in UnicodeData.txt
	402	UnicodeVersion() unless defined $v_unicode_version;
	403	if ($v_unicode_version ge v2.0.0 && $char =~ /\p{Block=Hangul_Syllables}/) {
	404	# The code points of the decomposition are output in standard Unicode
	405	# hex format, separated by blanks.
	406	$prop{'decomposition'} = join " ", map { sprintf("%04X", $_)}
	407	unpack "U*", NFD($char);
	408	}
	409	else {
	410	@DECOMPOSITIONS = _read_table("Decomposition.pl")
	411	unless @DECOMPOSITIONS;
	412	$prop{'decomposition'} = _search(\@DECOMPOSITIONS, 0, $#DECOMPOSITIONS,
	413	$code) // "";
	414	}
	415
	416	# Can use num() to get the numeric values, if any.
	417	if (! defined (my $value = num($char))) {
	418	$prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = "";
	419	}
	420	else {
	421	if ($char =~ /\d/) {
	422	$prop{'decimal'} = $prop{'digit'} = $prop{'numeric'} = $value;
	423	}
	424	else {
	425
	426	# For non-decimal-digits, we have to read in the Numeric type
	427	# to distinguish them. It is not just a matter of integer vs.
	428	# rational, as some whole number values are not considered digits,
	429	# e.g., TAMIL NUMBER TEN.
	430	$prop{'decimal'} = "";
	431
	432	@NUMERIC_TYPES =_read_table("To/Nt.pl") unless @NUMERIC_TYPES;
	433	if ((_search(\@NUMERIC_TYPES, 0, $#NUMERIC_TYPES, $code) // "")
	434	eq 'Digit')
	435	{
	436	$prop{'digit'} = $prop{'numeric'} = $value;
	437	}
	438	else {
	439	$prop{'digit'} = "";
	440	$prop{'numeric'} = $real_to_rational{$value} // $value;
	441	}
	442	}
	443	}
	444
	445	$prop{'mirrored'} = ($char =~ /\p{Bidi_Mirrored}/) ? 'Y' : 'N';
	446
	447	%UNICODE_1_NAMES =_read_table("To/Na1.pl", "use_hash") unless %UNICODE_1_NAMES;
	448	$prop{'unicode10'} = $UNICODE_1_NAMES{$code} // "";
	449
	450	UnicodeVersion() unless defined $v_unicode_version;
	451	if ($v_unicode_version ge v6.0.0) {
	452	$prop{'comment'} = "";
	453	}
	454	else {
	455	%ISO_COMMENT = _read_table("To/Isc.pl", "use_hash") unless %ISO_COMMENT;
	456	$prop{'comment'} = (defined $ISO_COMMENT{$code})
	457	? $ISO_COMMENT{$code}
	458	: "";
	459	}
	460
	461	%SIMPLE_UPPER = _read_table("To/Uc.pl", "use_hash") unless %SIMPLE_UPPER;
	462	$prop{'upper'} = (defined $SIMPLE_UPPER{$code})
	463	? sprintf("%04X", $SIMPLE_UPPER{$code})
	464	: "";
	465
	466	%SIMPLE_LOWER = _read_table("To/Lc.pl", "use_hash") unless %SIMPLE_LOWER;
	467	$prop{'lower'} = (defined $SIMPLE_LOWER{$code})
	468	? sprintf("%04X", $SIMPLE_LOWER{$code})
	469	: "";
	470
	471	%SIMPLE_TITLE = _read_table("To/Tc.pl", "use_hash") unless %SIMPLE_TITLE;
	472	$prop{'title'} = (defined $SIMPLE_TITLE{$code})
	473	? sprintf("%04X", $SIMPLE_TITLE{$code})
	474	: "";
	475
	476	$prop{block} = charblock($code);
	477	$prop{script} = charscript($code);
	478	return \%prop;
	479	}
	480
	481	sub _search { # Binary search in a [[lo,hi,prop],[...],...] table.
	482	my ($table, $lo, $hi, $code) = @_;
	483
	484	return if $lo > $hi;
	485
	486	my $mid = int(($lo+$hi) / 2);
	487
	488	if ($table->[$mid]->[0] < $code) {
	489	if ($table->[$mid]->[1] >= $code) {
	490	return $table->[$mid]->[2];
	491	} else {
	492	_search($table, $mid + 1, $hi, $code);
	493	}
	494	} elsif ($table->[$mid]->[0] > $code) {
	495	_search($table, $lo, $mid - 1, $code);
	496	} else {
	497	return $table->[$mid]->[2];
	498	}
	499	}
	500
	501	sub _read_table ($;$) {
	502
	503	# Returns the contents of the mktables generated table file located at $1
	504	# in the form of either an array of arrays or a hash, depending on if the
	505	# optional second parameter is true (for hash return) or not. In the case
	506	# of a hash return, each key is a code point, and its corresponding value
	507	# is what the table gives as the code point's corresponding value. In the
	508	# case of an array return, each outer array denotes a range with [0] the
	509	# start point of that range; [1] the end point; and [2] the value that
	510	# every code point in the range has. The hash return is useful for fast
	511	# lookup when the table contains only single code point ranges. The array
	512	# return takes much less memory when there are large ranges.
	513	#
	514	# This function has the side effect of setting
	515	# $utf8::SwashInfo{$property}{'format'} to be the mktables format of the
	516	# table; and
	517	# $utf8::SwashInfo{$property}{'missing'} to be the value for all entries
	518	# not listed in the table.
	519	# where $property is the Unicode property name, preceded by 'To' for map
	520	# properties., e.g., 'ToSc'.
	521	#
	522	# Table entries look like one of:
	523	# 0000 0040 Common # [65]
	524	# 00AA Latin
	525
	526	my $table = shift;
	527	my $return_hash = shift;
	528	$return_hash = 0 unless defined $return_hash;
	529	my @return;
	530	my %return;
	531	local $_;
	532	my $list = do "unicore/$table";
	533
	534	# Look up if this property requires adjustments, which we do below if it
	535	# does.
	536	require "unicore/Heavy.pl";
	537	my $property = $table =~ s/\.pl//r;
	538	$property = $utf8::file_to_swash_name{$property};
	539	my $to_adjust = defined $property
	540	&& $utf8::SwashInfo{$property}{'format'} eq 'a';
	541
	542	for (split /^/m, $list) {
	543	my ($start, $end, $value) = / ^ (.+?) \t (.*?) \t (.+?)
	544	\s* ( \# .* )? # Optional comment
	545	$ /x;
	546	my $decimal_start = hex $start;
	547	my $decimal_end = ($end eq "") ? $decimal_start : hex $end;
	548	if ($return_hash) {
	549	foreach my $i ($decimal_start .. $decimal_end) {
	550	$return{$i} = ($to_adjust)
	551	? $value + $i - $decimal_start
	552	: $value;
	553	}
	554	}
	555	elsif (! $to_adjust
	556	&& @return
	557	&& $return[-1][1] == $decimal_start - 1
	558	&& $return[-1][2] eq $value)
	559	{
	560	# If this is merely extending the previous range, do just that.
	561	$return[-1]->[1] = $decimal_end;
	562	}
	563	else {
	564	push @return, [ $decimal_start, $decimal_end, $value ];
	565	}
	566	}
	567	return ($return_hash) ? %return : @return;
	568	}
	569
	570	sub charinrange {
	571	my ($range, $arg) = @_;
	572	my $code = _getcode($arg);
	573	croak __PACKAGE__, "::charinrange: unknown code '$arg'"
	574	unless defined $code;
	575	_search($range, 0, $#$range, $code);
	576	}
	577
	578	=head2 B<charblock()>
	579
	580	use Unicode::UCD 'charblock';
	581
	582	my $charblock = charblock(0x41);
	583	my $charblock = charblock(1234);
	584	my $charblock = charblock(0x263a);
	585	my $charblock = charblock("U+263a");
	586
	587	my $range = charblock('Armenian');
	588
	589	With a L</code point argument> charblock() returns the I<block> the code point
	590	belongs to, e.g. C<Basic Latin>. The old-style block name is returned (see
	591	L</Old-style versus new-style block names>).
	592	If the code point is unassigned, this returns the block it would belong to if
	593	it were assigned. (If the Unicode version being used is so early as to not
	594	have blocks, all code points are considered to be in C<No_Block>.)
	595
	596	See also L</Blocks versus Scripts>.
	597
	598	If supplied with an argument that can't be a code point, charblock() tries to
	599	do the opposite and interpret the argument as an old-style block name. The
	600	return value
	601	is a I<range set> with one range: an anonymous list with a single element that
	602	consists of another anonymous list whose first element is the first code point
	603	in the block, and whose second (and final) element is the final code point in
	604	the block. (The extra list consisting of just one element is so that the same
	605	program logic can be used to handle both this return, and the return from
	606	L</charscript()> which can have multiple ranges.) You can test whether a code
	607	point is in a range using the L</charinrange()> function. If the argument is
	608	not a known block, C<undef> is returned.
	609
	610	=cut
	611
	612	my @BLOCKS;
	613	my %BLOCKS;
	614
	615	sub _charblocks {
	616
	617	# Can't read from the mktables table because it loses the hyphens in the
	618	# original.
	619	unless (@BLOCKS) {
	620	UnicodeVersion() unless defined $v_unicode_version;
	621	if ($v_unicode_version lt v2.0.0) {
	622	my $subrange = [ 0, 0x10FFFF, 'No_Block' ];
	623	push @BLOCKS, $subrange;
	624	push @{$BLOCKS{'No_Block'}}, $subrange;
	625	}
	626	elsif (openunicode(\$BLOCKSFH, "Blocks.txt")) {
	627	local $_;
	628	local $/ = "\n";
	629	while (<$BLOCKSFH>) {
	630	if (/^([0-9A-F]+)\.\.([0-9A-F]+);\s+(.+)/) {
	631	my ($lo, $hi) = (hex($1), hex($2));
	632	my $subrange = [ $lo, $hi, $3 ];
	633	push @BLOCKS, $subrange;
	634	push @{$BLOCKS{$3}}, $subrange;
	635	}
	636	}
	637	close($BLOCKSFH);
	638	}
	639	}
	640	}
	641
	642	sub charblock {
	643	my $arg = shift;
	644
	645	_charblocks() unless @BLOCKS;
	646
	647	my $code = _getcode($arg);
	648
	649	if (defined $code) {
	650	my $result = _search(\@BLOCKS, 0, $#BLOCKS, $code);
	651	return $result if defined $result;
	652	return 'No_Block';
	653	}
	654	elsif (exists $BLOCKS{$arg}) {
	655	return _dclone $BLOCKS{$arg};
	656	}
	657	}
	658
	659	=head2 B<charscript()>
	660
	661	use Unicode::UCD 'charscript';
	662
	663	my $charscript = charscript(0x41);
	664	my $charscript = charscript(1234);
	665	my $charscript = charscript("U+263a");
	666
	667	my $range = charscript('Thai');
	668
	669	With a L</code point argument> charscript() returns the I<script> the
	670	code point belongs to, e.g. C<Latin>, C<Greek>, C<Han>.
	671	If the code point is unassigned or the Unicode version being used is so early
	672	that it doesn't have scripts, this function returns C<"Unknown">.
	673
	674	If supplied with an argument that can't be a code point, charscript() tries
	675	to do the opposite and interpret the argument as a script name. The
	676	return value is a I<range set>: an anonymous list of lists that contain
	677	I<start-of-range>, I<end-of-range> code point pairs. You can test whether a
	678	code point is in a range set using the L</charinrange()> function. If the
	679	argument is not a known script, C<undef> is returned.
	680
	681	See also L</Blocks versus Scripts>.
	682
	683	=cut
	684
	685	my @SCRIPTS;
	686	my %SCRIPTS;
	687
	688	sub _charscripts {
	689	unless (@SCRIPTS) {
	690	UnicodeVersion() unless defined $v_unicode_version;
	691	if ($v_unicode_version lt v3.1.0) {
	692	push @SCRIPTS, [ 0, 0x10FFFF, 'Unknown' ];
	693	}
	694	else {
	695	@SCRIPTS =_read_table("To/Sc.pl");
	696	}
	697	}
	698	foreach my $entry (@SCRIPTS) {
	699	$entry->[2] =~ s/(_\w)/\L$1/g; # Preserve old-style casing
	700	push @{$SCRIPTS{$entry->[2]}}, $entry;
	701	}
	702	}
	703
	704	sub charscript {
	705	my $arg = shift;
	706
	707	_charscripts() unless @SCRIPTS;
	708
	709	my $code = _getcode($arg);
	710
	711	if (defined $code) {
	712	my $result = _search(\@SCRIPTS, 0, $#SCRIPTS, $code);
	713	return $result if defined $result;
	714	return $utf8::SwashInfo{'ToSc'}{'missing'};
	715	} elsif (exists $SCRIPTS{$arg}) {
	716	return _dclone $SCRIPTS{$arg};
	717	}
	718
	719	return;
	720	}
	721
	722	=head2 B<charblocks()>
	723
	724	use Unicode::UCD 'charblocks';
	725
	726	my $charblocks = charblocks();
	727
	728	charblocks() returns a reference to a hash with the known block names
	729	as the keys, and the code point ranges (see L</charblock()>) as the values.
	730
	731	The names are in the old-style (see L</Old-style versus new-style block
	732	names>).
	733
	734	L<prop_invmap("block")\|/prop_invmap()> can be used to get this same data in a
	735	different type of data structure.
	736
	737	See also L</Blocks versus Scripts>.
	738
	739	=cut
	740
	741	sub charblocks {
	742	_charblocks() unless %BLOCKS;
	743	return _dclone \%BLOCKS;
	744	}
	745
	746	=head2 B<charscripts()>
	747
	748	use Unicode::UCD 'charscripts';
	749
	750	my $charscripts = charscripts();
	751
	752	charscripts() returns a reference to a hash with the known script
	753	names as the keys, and the code point ranges (see L</charscript()>) as
	754	the values.
	755
	756	L<prop_invmap("script")\|/prop_invmap()> can be used to get this same data in a
	757	different type of data structure.
	758
	759	See also L</Blocks versus Scripts>.
	760
	761	=cut
	762
	763	sub charscripts {
	764	_charscripts() unless %SCRIPTS;
	765	return _dclone \%SCRIPTS;
	766	}
	767
	768	=head2 B<charinrange()>
	769
	770	In addition to using the C<\p{Blk=...}> and C<\P{Blk=...}> constructs, you
	771	can also test whether a code point is in the I<range> as returned by
	772	L</charblock()> and L</charscript()> or as the values of the hash returned
	773	by L</charblocks()> and L</charscripts()> by using charinrange():
	774
	775	use Unicode::UCD qw(charscript charinrange);
	776
	777	$range = charscript('Hiragana');
	778	print "looks like hiragana\n" if charinrange($range, $codepoint);
	779
	780	=cut
	781
	782	my %GENERAL_CATEGORIES =
	783	(
	784	'L' => 'Letter',
	785	'LC' => 'CasedLetter',
	786	'Lu' => 'UppercaseLetter',
	787	'Ll' => 'LowercaseLetter',
	788	'Lt' => 'TitlecaseLetter',
	789	'Lm' => 'ModifierLetter',
	790	'Lo' => 'OtherLetter',
	791	'M' => 'Mark',
	792	'Mn' => 'NonspacingMark',
	793	'Mc' => 'SpacingMark',
	794	'Me' => 'EnclosingMark',
	795	'N' => 'Number',
	796	'Nd' => 'DecimalNumber',
	797	'Nl' => 'LetterNumber',
	798	'No' => 'OtherNumber',
	799	'P' => 'Punctuation',
	800	'Pc' => 'ConnectorPunctuation',
	801	'Pd' => 'DashPunctuation',
	802	'Ps' => 'OpenPunctuation',
	803	'Pe' => 'ClosePunctuation',
	804	'Pi' => 'InitialPunctuation',
	805	'Pf' => 'FinalPunctuation',
	806	'Po' => 'OtherPunctuation',
	807	'S' => 'Symbol',
	808	'Sm' => 'MathSymbol',
	809	'Sc' => 'CurrencySymbol',
	810	'Sk' => 'ModifierSymbol',
	811	'So' => 'OtherSymbol',
	812	'Z' => 'Separator',
	813	'Zs' => 'SpaceSeparator',
	814	'Zl' => 'LineSeparator',
	815	'Zp' => 'ParagraphSeparator',
	816	'C' => 'Other',
	817	'Cc' => 'Control',
	818	'Cf' => 'Format',
	819	'Cs' => 'Surrogate',
	820	'Co' => 'PrivateUse',
	821	'Cn' => 'Unassigned',
	822	);
	823
	824	sub general_categories {
	825	return _dclone \%GENERAL_CATEGORIES;
	826	}
	827
	828	=head2 B<general_categories()>
	829
	830	use Unicode::UCD 'general_categories';
	831
	832	my $categories = general_categories();
	833
	834	This returns a reference to a hash which has short
	835	general category names (such as C<Lu>, C<Nd>, C<Zs>, C<S>) as keys and long
	836	names (such as C<UppercaseLetter>, C<DecimalNumber>, C<SpaceSeparator>,
	837	C<Symbol>) as values. The hash is reversible in case you need to go
	838	from the long names to the short names. The general category is the
	839	one returned from
	840	L</charinfo()> under the C<category> key.
	841
	842	The L</prop_value_aliases()> function can be used to get all the synonyms of
	843	the category name.
	844
	845	=cut
	846
	847	my %BIDI_TYPES =
	848	(
	849	'L' => 'Left-to-Right',
	850	'LRE' => 'Left-to-Right Embedding',
	851	'LRO' => 'Left-to-Right Override',
	852	'R' => 'Right-to-Left',
	853	'AL' => 'Right-to-Left Arabic',
	854	'RLE' => 'Right-to-Left Embedding',
	855	'RLO' => 'Right-to-Left Override',
	856	'PDF' => 'Pop Directional Format',
	857	'EN' => 'European Number',
	858	'ES' => 'European Number Separator',
	859	'ET' => 'European Number Terminator',
	860	'AN' => 'Arabic Number',
	861	'CS' => 'Common Number Separator',
	862	'NSM' => 'Non-Spacing Mark',
	863	'BN' => 'Boundary Neutral',
	864	'B' => 'Paragraph Separator',
	865	'S' => 'Segment Separator',
	866	'WS' => 'Whitespace',
	867	'ON' => 'Other Neutrals',
	868	);
	869
	870	=head2 B<bidi_types()>
	871
	872	use Unicode::UCD 'bidi_types';
	873
	874	my $categories = bidi_types();
	875
	876	This returns a reference to a hash which has the short
	877	bidi (bidirectional) type names (such as C<L>, C<R>) as keys and long
	878	names (such as C<Left-to-Right>, C<Right-to-Left>) as values. The
	879	hash is reversible in case you need to go from the long names to the
	880	short names. The bidi type is the one returned from
	881	L</charinfo()>
	882	under the C<bidi> key. For the exact meaning of the various bidi classes
	883	the Unicode TR9 is recommended reading:
	884	L<http://www.unicode.org/reports/tr9/>
	885	(as of Unicode 5.0.0)
	886
	887	The L</prop_value_aliases()> function can be used to get all the synonyms of
	888	the bidi type name.
	889
	890	=cut
	891
	892	sub bidi_types {
	893	return _dclone \%BIDI_TYPES;
	894	}
	895
	896	=head2 B<compexcl()>
	897
	898	use Unicode::UCD 'compexcl';
	899
	900	my $compexcl = compexcl(0x09dc);
	901
	902	This routine returns C<undef> if the Unicode version being used is so early
	903	that it doesn't have this property. It is included for backwards
	904	compatibility, but as of Perl 5.12 and more modern Unicode versions, for
	905	most purposes it is probably more convenient to use one of the following
	906	instead:
	907
	908	my $compexcl = chr(0x09dc) =~ /\p{Comp_Ex};
	909	my $compexcl = chr(0x09dc) =~ /\p{Full_Composition_Exclusion};
	910
	911	or even
	912
	913	my $compexcl = chr(0x09dc) =~ /\p{CE};
	914	my $compexcl = chr(0x09dc) =~ /\p{Composition_Exclusion};
	915
	916	The first two forms return B<true> if the L</code point argument> should not
	917	be produced by composition normalization. For the final two forms to return
	918	B<true>, it is additionally required that this fact not otherwise be
	919	determinable from the Unicode data base.
	920
	921	This routine behaves identically to the final two forms. That is,
	922	it does not return B<true> if the code point has a decomposition
	923	consisting of another single code point, nor if its decomposition starts
	924	with a code point whose combining class is non-zero. Code points that meet
	925	either of these conditions should also not be produced by composition
	926	normalization, which is probably why you should use the
	927	C<Full_Composition_Exclusion> property instead, as shown above.
	928
	929	The routine returns B<false> otherwise.
	930
	931	=cut
	932
	933	sub compexcl {
	934	my $arg = shift;
	935	my $code = _getcode($arg);
	936	croak __PACKAGE__, "::compexcl: unknown code '$arg'"
	937	unless defined $code;
	938
	939	UnicodeVersion() unless defined $v_unicode_version;
	940	return if $v_unicode_version lt v3.0.0;
	941
	942	no warnings "non_unicode"; # So works on non-Unicode code points
	943	return chr($code) =~ /\p{Composition_Exclusion}/;
	944	}
	945
	946	=head2 B<casefold()>
	947
	948	use Unicode::UCD 'casefold';
	949
	950	my $casefold = casefold(0xDF);
	951	if (defined $casefold) {
	952	my @full_fold_hex = split / /, $casefold->{'full'};
	953	my $full_fold_string =
	954	join "", map {chr(hex($_))} @full_fold_hex;
	955	my @turkic_fold_hex =
	956	split / /, ($casefold->{'turkic'} ne "")
	957	? $casefold->{'turkic'}
	958	: $casefold->{'full'};
	959	my $turkic_fold_string =
	960	join "", map {chr(hex($_))} @turkic_fold_hex;
	961	}
	962	if (defined $casefold && $casefold->{'simple'} ne "") {
	963	my $simple_fold_hex = $casefold->{'simple'};
	964	my $simple_fold_string = chr(hex($simple_fold_hex));
	965	}
	966
	967	This returns the (almost) locale-independent case folding of the
	968	character specified by the L</code point argument>. (Starting in Perl v5.16,
	969	the core function C<fc()> returns the C<full> mapping (described below)
	970	faster than this does, and for entire strings.)
	971
	972	If there is no case folding for the input code point, C<undef> is returned.
	973
	974	If there is a case folding for that code point, a reference to a hash
	975	with the following fields is returned:
	976
	977	=over
	978
	979	=item B<code>
	980
	981	the input L</code point argument> expressed in hexadecimal, with leading zeros
	982	added if necessary to make it contain at least four hexdigits
	983
	984	=item B<full>
	985
	986	one or more codes (separated by spaces) that, taken in order, give the
	987	code points for the case folding for I<code>.
	988	Each has at least four hexdigits.
	989
	990	=item B<simple>
	991
	992	is empty, or is exactly one code with at least four hexdigits which can be used
	993	as an alternative case folding when the calling program cannot cope with the
	994	fold being a sequence of multiple code points. If I<full> is just one code
	995	point, then I<simple> equals I<full>. If there is no single code point folding
	996	defined for I<code>, then I<simple> is the empty string. Otherwise, it is an
	997	inferior, but still better-than-nothing alternative folding to I<full>.
	998
	999	=item B<mapping>
	1000
	1001	is the same as I<simple> if I<simple> is not empty, and it is the same as I<full>
	1002	otherwise. It can be considered to be the simplest possible folding for
	1003	I<code>. It is defined primarily for backwards compatibility.
	1004
	1005	=item B<status>
	1006
	1007	is C<C> (for C<common>) if the best possible fold is a single code point
	1008	(I<simple> equals I<full> equals I<mapping>). It is C<S> if there are distinct
	1009	folds, I<simple> and I<full> (I<mapping> equals I<simple>). And it is C<F> if
	1010	there is only a I<full> fold (I<mapping> equals I<full>; I<simple> is empty).
	1011	Note that this
	1012	describes the contents of I<mapping>. It is defined primarily for backwards
	1013	compatibility.
	1014
	1015	For Unicode versions between 3.1 and 3.1.1 inclusive, I<status> can also be
	1016	C<I> which is the same as C<C> but is a special case for dotted uppercase I and
	1017	dotless lowercase i:
	1018
	1019	=over
	1020
	1021	=item Z<>B<*> If you use this C<I> mapping
	1022
	1023	the result is case-insensitive,
	1024	but dotless and dotted I's are not distinguished
	1025
	1026	=item Z<>B<*> If you exclude this C<I> mapping
	1027
	1028	the result is not fully case-insensitive, but
	1029	dotless and dotted I's are distinguished
	1030
	1031	=back
	1032
	1033	=item B<turkic>
	1034
	1035	contains any special folding for Turkic languages. For versions of Unicode
	1036	starting with 3.2, this field is empty unless I<code> has a different folding
	1037	in Turkic languages, in which case it is one or more codes (separated by
	1038	spaces) that, taken in order, give the code points for the case folding for
	1039	I<code> in those languages.
	1040	Each code has at least four hexdigits.
	1041	Note that this folding does not maintain canonical equivalence without
	1042	additional processing.
	1043
	1044	For Unicode versions between 3.1 and 3.1.1 inclusive, this field is empty unless
	1045	there is a
	1046	special folding for Turkic languages, in which case I<status> is C<I>, and
	1047	I<mapping>, I<full>, I<simple>, and I<turkic> are all equal.
	1048
	1049	=back
	1050
	1051	Programs that want complete generality and the best folding results should use
	1052	the folding contained in the I<full> field. But note that the fold for some
	1053	code points will be a sequence of multiple code points.
	1054
	1055	Programs that can't cope with the fold mapping being multiple code points can
	1056	use the folding contained in the I<simple> field, with the loss of some
	1057	generality. In Unicode 5.1, about 7% of the defined foldings have no single
	1058	code point folding.
	1059
	1060	The I<mapping> and I<status> fields are provided for backwards compatibility for
	1061	existing programs. They contain the same values as in previous versions of
	1062	this function.
	1063
	1064	Locale is not completely independent. The I<turkic> field contains results to
	1065	use when the locale is a Turkic language.
	1066
	1067	For more information about case mappings see
	1068	L<http://www.unicode.org/unicode/reports/tr21>
	1069
	1070	=cut
	1071
	1072	my %CASEFOLD;
	1073
	1074	sub _casefold {
	1075	unless (%CASEFOLD) { # Populate the hash
	1076	my ($full_invlist_ref, $full_invmap_ref, undef, $default)
	1077	= prop_invmap('Case_Folding');
	1078
	1079	# Use the recipe given in the prop_invmap() pod to convert the
	1080	# inversion map into the hash.
	1081	for my $i (0 .. @$full_invlist_ref - 1 - 1) {
	1082	next if $full_invmap_ref->[$i] == $default;
	1083	my $adjust = -1;
	1084	for my $j ($full_invlist_ref->[$i] .. $full_invlist_ref->[$i+1] -1) {
	1085	$adjust++;
	1086	if (! ref $full_invmap_ref->[$i]) {
	1087
	1088	# This is a single character mapping
	1089	$CASEFOLD{$j}{'status'} = 'C';
	1090	$CASEFOLD{$j}{'simple'}
	1091	= $CASEFOLD{$j}{'full'}
	1092	= $CASEFOLD{$j}{'mapping'}
	1093	= sprintf("%04X", $full_invmap_ref->[$i] + $adjust);
	1094	$CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
	1095	$CASEFOLD{$j}{'turkic'} = "";
	1096	}
	1097	else { # prop_invmap ensures that $adjust is 0 for a ref
	1098	$CASEFOLD{$j}{'status'} = 'F';
	1099	$CASEFOLD{$j}{'full'}
	1100	= $CASEFOLD{$j}{'mapping'}
	1101	= join " ", map { sprintf "%04X", $_ }
	1102	@{$full_invmap_ref->[$i]};
	1103	$CASEFOLD{$j}{'simple'} = "";
	1104	$CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
	1105	$CASEFOLD{$j}{'turkic'} = "";
	1106	}
	1107	}
	1108	}
	1109
	1110	# We have filled in the full mappings above, assuming there were no
	1111	# simple ones for the ones with multi-character maps. Now, we find
	1112	# and fix the cases where that assumption was false.
	1113	(my ($simple_invlist_ref, $simple_invmap_ref, undef), $default)
	1114	= prop_invmap('Simple_Case_Folding');
	1115	for my $i (0 .. @$simple_invlist_ref - 1 - 1) {
	1116	next if $simple_invmap_ref->[$i] == $default;
	1117	my $adjust = -1;
	1118	for my $j ($simple_invlist_ref->[$i]
	1119	.. $simple_invlist_ref->[$i+1] -1)
	1120	{
	1121	$adjust++;
	1122	next if $CASEFOLD{$j}{'status'} eq 'C';
	1123	$CASEFOLD{$j}{'status'} = 'S';
	1124	$CASEFOLD{$j}{'simple'}
	1125	= $CASEFOLD{$j}{'mapping'}
	1126	= sprintf("%04X", $simple_invmap_ref->[$i] + $adjust);
	1127	$CASEFOLD{$j}{'code'} = sprintf("%04X", $j);
	1128	$CASEFOLD{$j}{'turkic'} = "";
	1129	}
	1130	}
	1131
	1132	# We hard-code in the turkish rules
	1133	UnicodeVersion() unless defined $v_unicode_version;
	1134	if ($v_unicode_version ge v3.2.0) {
	1135
	1136	# These two code points should already have regular entries, so
	1137	# just fill in the turkish fields
	1138	$CASEFOLD{ord('I')}{'turkic'} = '0131';
	1139	$CASEFOLD{0x130}{'turkic'} = sprintf "%04X", ord('i');
	1140	}
	1141	elsif ($v_unicode_version ge v3.1.0) {
	1142
	1143	# These two code points don't have entries otherwise.
	1144	$CASEFOLD{0x130}{'code'} = '0130';
	1145	$CASEFOLD{0x131}{'code'} = '0131';
	1146	$CASEFOLD{0x130}{'status'} = $CASEFOLD{0x131}{'status'} = 'I';
	1147	$CASEFOLD{0x130}{'turkic'}
	1148	= $CASEFOLD{0x130}{'mapping'}
	1149	= $CASEFOLD{0x130}{'full'}
	1150	= $CASEFOLD{0x130}{'simple'}
	1151	= $CASEFOLD{0x131}{'turkic'}
	1152	= $CASEFOLD{0x131}{'mapping'}
	1153	= $CASEFOLD{0x131}{'full'}
	1154	= $CASEFOLD{0x131}{'simple'}
	1155	= sprintf "%04X", ord('i');
	1156	}
	1157	}
	1158	}
	1159
	1160	sub casefold {
	1161	my $arg = shift;
	1162	my $code = _getcode($arg);
	1163	croak __PACKAGE__, "::casefold: unknown code '$arg'"
	1164	unless defined $code;
	1165
	1166	_casefold() unless %CASEFOLD;
	1167
	1168	return $CASEFOLD{$code};
	1169	}
	1170
	1171	=head2 B<all_casefolds()>
	1172
	1173
	1174	use Unicode::UCD 'all_casefolds';
	1175
	1176	my $all_folds_ref = all_casefolds();
	1177	foreach my $char_with_casefold (sort { $a <=> $b }
	1178	keys %$all_folds_ref)
	1179	{
	1180	printf "%04X:", $char_with_casefold;
	1181	my $casefold = $all_folds_ref->{$char_with_casefold};
	1182
	1183	# Get folds for $char_with_casefold
	1184
	1185	my @full_fold_hex = split / /, $casefold->{'full'};
	1186	my $full_fold_string =
	1187	join "", map {chr(hex($_))} @full_fold_hex;
	1188	print " full=", join " ", @full_fold_hex;
	1189	my @turkic_fold_hex =
	1190	split / /, ($casefold->{'turkic'} ne "")
	1191	? $casefold->{'turkic'}
	1192	: $casefold->{'full'};
	1193	my $turkic_fold_string =
	1194	join "", map {chr(hex($_))} @turkic_fold_hex;
	1195	print "; turkic=", join " ", @turkic_fold_hex;
	1196	if (defined $casefold && $casefold->{'simple'} ne "") {
	1197	my $simple_fold_hex = $casefold->{'simple'};
	1198	my $simple_fold_string = chr(hex($simple_fold_hex));
	1199	print "; simple=$simple_fold_hex";
	1200	}
	1201	print "\n";
	1202	}
	1203
	1204	This returns all the case foldings in the current version of Unicode in the
	1205	form of a reference to a hash. Each key to the hash is the decimal
	1206	representation of a Unicode character that has a casefold to other than
	1207	itself. The casefold of a semi-colon is itself, so it isn't in the hash;
	1208	likewise for a lowercase "a", but there is an entry for a capital "A". The
	1209	hash value for each key is another hash, identical to what is returned by
	1210	L</casefold()> if called with that code point as its argument. So the value
	1211	C<< all_casefolds()->{ord("A")}' >> is equivalent to C<casefold(ord("A"))>;
	1212
	1213	=cut
	1214
	1215	sub all_casefolds () {
	1216	_casefold() unless %CASEFOLD;
	1217	return _dclone \%CASEFOLD;
	1218	}
	1219
	1220	=head2 B<casespec()>
	1221
	1222	use Unicode::UCD 'casespec';
	1223
	1224	my $casespec = casespec(0xFB00);
	1225
	1226	This returns the potentially locale-dependent case mappings of the L</code point
	1227	argument>. The mappings may be longer than a single code point (which the basic
	1228	Unicode case mappings as returned by L</charinfo()> never are).
	1229
	1230	If there are no case mappings for the L</code point argument>, or if all three
	1231	possible mappings (I<lower>, I<title> and I<upper>) result in single code
	1232	points and are locale independent and unconditional, C<undef> is returned
	1233	(which means that the case mappings, if any, for the code point are those
	1234	returned by L</charinfo()>).
	1235
	1236	Otherwise, a reference to a hash giving the mappings (or a reference to a hash
	1237	of such hashes, explained below) is returned with the following keys and their
	1238	meanings:
	1239
	1240	The keys in the bottom layer hash with the meanings of their values are:
	1241
	1242	=over
	1243
	1244	=item B<code>
	1245
	1246	the input L</code point argument> expressed in hexadecimal, with leading zeros
	1247	added if necessary to make it contain at least four hexdigits
	1248
	1249	=item B<lower>
	1250
	1251	one or more codes (separated by spaces) that, taken in order, give the
	1252	code points for the lower case of I<code>.
	1253	Each has at least four hexdigits.
	1254
	1255	=item B<title>
	1256
	1257	one or more codes (separated by spaces) that, taken in order, give the
	1258	code points for the title case of I<code>.
	1259	Each has at least four hexdigits.
	1260
	1261	=item B<upper>
	1262
	1263	one or more codes (separated by spaces) that, taken in order, give the
	1264	code points for the upper case of I<code>.
	1265	Each has at least four hexdigits.
	1266
	1267	=item B<condition>
	1268
	1269	the conditions for the mappings to be valid.
	1270	If C<undef>, the mappings are always valid.
	1271	When defined, this field is a list of conditions,
	1272	all of which must be true for the mappings to be valid.
	1273	The list consists of one or more
	1274	I<locales> (see below)
	1275	and/or I<contexts> (explained in the next paragraph),
	1276	separated by spaces.
	1277	(Other than as used to separate elements, spaces are to be ignored.)
	1278	Case distinctions in the condition list are not significant.
	1279	Conditions preceded by "NON_" represent the negation of the condition.
	1280
	1281	A I<context> is one of those defined in the Unicode standard.
	1282	For Unicode 5.1, they are defined in Section 3.13 C<Default Case Operations>
	1283	available at
	1284	L<http://www.unicode.org/versions/Unicode5.1.0/>.
	1285	These are for context-sensitive casing.
	1286
	1287	=back
	1288
	1289	The hash described above is returned for locale-independent casing, where
	1290	at least one of the mappings has length longer than one. If C<undef> is
	1291	returned, the code point may have mappings, but if so, all are length one,
	1292	and are returned by L</charinfo()>.
	1293	Note that when this function does return a value, it will be for the complete
	1294	set of mappings for a code point, even those whose length is one.
	1295
	1296	If there are additional casing rules that apply only in certain locales,
	1297	an additional key for each will be defined in the returned hash. Each such key
	1298	will be its locale name, defined as a 2-letter ISO 3166 country code, possibly
	1299	followed by a "_" and a 2-letter ISO language code (possibly followed by a "_"
	1300	and a variant code). You can find the lists of all possible locales, see
	1301	L<Locale::Country> and L<Locale::Language>.
	1302	(In Unicode 6.0, the only locales returned by this function
	1303	are C<lt>, C<tr>, and C<az>.)
	1304
	1305	Each locale key is a reference to a hash that has the form above, and gives
	1306	the casing rules for that particular locale, which take precedence over the
	1307	locale-independent ones when in that locale.
	1308
	1309	If the only casing for a code point is locale-dependent, then the returned
	1310	hash will not have any of the base keys, like C<code>, C<upper>, etc., but
	1311	will contain only locale keys.
	1312
	1313	For more information about case mappings see
	1314	L<http://www.unicode.org/unicode/reports/tr21/>
	1315
	1316	=cut
	1317
	1318	my %CASESPEC;
	1319
	1320	sub _casespec {
	1321	unless (%CASESPEC) {
	1322	UnicodeVersion() unless defined $v_unicode_version;
	1323	if ($v_unicode_version lt v2.1.8) {
	1324	%CASESPEC = {};
	1325	}
	1326	elsif (openunicode(\$CASESPECFH, "SpecialCasing.txt")) {
	1327	local $_;
	1328	local $/ = "\n";
	1329	while (<$CASESPECFH>) {
	1330	if (/^([0-9A-F]+); ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; ([0-9A-F]+(?: [0-9A-F]+))?; (\w+(?: \w+))?/) {
	1331
	1332	my ($hexcode, $lower, $title, $upper, $condition) =
	1333	($1, $2, $3, $4, $5);
	1334	my $code = hex($hexcode);
	1335
	1336	# In 2.1.8, there were duplicate entries; ignore all but
	1337	# the first one -- there were no conditions in the file
	1338	# anyway.
	1339	if (exists $CASESPEC{$code} && $v_unicode_version ne v2.1.8)
	1340	{
	1341	if (exists $CASESPEC{$code}->{code}) {
	1342	my ($oldlower,
	1343	$oldtitle,
	1344	$oldupper,
	1345	$oldcondition) =
	1346	@{$CASESPEC{$code}}{qw(lower
	1347	title
	1348	upper
	1349	condition)};
	1350	if (defined $oldcondition) {
	1351	my ($oldlocale) =
	1352	($oldcondition =~ /^([a-z][a-z](?:_\S+)?)/);
	1353	delete $CASESPEC{$code};
	1354	$CASESPEC{$code}->{$oldlocale} =
	1355	{ code => $hexcode,
	1356	lower => $oldlower,
	1357	title => $oldtitle,
	1358	upper => $oldupper,
	1359	condition => $oldcondition };
	1360	}
	1361	}
	1362	my ($locale) =
	1363	($condition =~ /^([a-z][a-z](?:_\S+)?)/);
	1364	$CASESPEC{$code}->{$locale} =
	1365	{ code => $hexcode,
	1366	lower => $lower,
	1367	title => $title,
	1368	upper => $upper,
	1369	condition => $condition };
	1370	} else {
	1371	$CASESPEC{$code} =
	1372	{ code => $hexcode,
	1373	lower => $lower,
	1374	title => $title,
	1375	upper => $upper,
	1376	condition => $condition };
	1377	}
	1378	}
	1379	}
	1380	close($CASESPECFH);
	1381	}
	1382	}
	1383	}
	1384
	1385	sub casespec {
	1386	my $arg = shift;
	1387	my $code = _getcode($arg);
	1388	croak __PACKAGE__, "::casespec: unknown code '$arg'"
	1389	unless defined $code;
	1390
	1391	_casespec() unless %CASESPEC;
	1392
	1393	return ref $CASESPEC{$code} ? _dclone $CASESPEC{$code} : $CASESPEC{$code};
	1394	}
	1395
	1396	=head2 B<namedseq()>
	1397
	1398	use Unicode::UCD 'namedseq';
	1399
	1400	my $namedseq = namedseq("KATAKANA LETTER AINU P");
	1401	my @namedseq = namedseq("KATAKANA LETTER AINU P");
	1402	my %namedseq = namedseq();
	1403
	1404	If used with a single argument in a scalar context, returns the string
	1405	consisting of the code points of the named sequence, or C<undef> if no
	1406	named sequence by that name exists. If used with a single argument in
	1407	a list context, it returns the list of the ordinals of the code points. If used
	1408	with no
	1409	arguments in a list context, returns a hash with the names of the
	1410	named sequences as the keys and the named sequences as strings as
	1411	the values. Otherwise, it returns C<undef> or an empty list depending
	1412	on the context.
	1413
	1414	This function only operates on officially approved (not provisional) named
	1415	sequences.
	1416
	1417	Note that as of Perl 5.14, C<\N{KATAKANA LETTER AINU P}> will insert the named
	1418	sequence into double-quoted strings, and C<charnames::string_vianame("KATAKANA
	1419	LETTER AINU P")> will return the same string this function does, but will also
	1420	operate on character names that aren't named sequences, without you having to
	1421	know which are which. See L<charnames>.
	1422
	1423	=cut
	1424
	1425	my %NAMEDSEQ;
	1426
	1427	sub _namedseq {
	1428	unless (%NAMEDSEQ) {
	1429	if (openunicode(\$NAMEDSEQFH, "Name.pl")) {
	1430	local $_;
	1431	local $/ = "\n";
	1432	while (<$NAMEDSEQFH>) {
	1433	if (/^ [0-9A-F]+ \ /x) {
	1434	chomp;
	1435	my ($sequence, $name) = split /\t/;
	1436	my @s = map { chr(hex($_)) } split(' ', $sequence);
	1437	$NAMEDSEQ{$name} = join("", @s);
	1438	}
	1439	}
	1440	close($NAMEDSEQFH);
	1441	}
	1442	}
	1443	}
	1444
	1445	sub namedseq {
	1446
	1447	# Use charnames::string_vianame() which now returns this information,
	1448	# unless the caller wants the hash returned, in which case we read it in,
	1449	# and thereafter use it instead of calling charnames, as it is faster.
	1450
	1451	my $wantarray = wantarray();
	1452	if (defined $wantarray) {
	1453	if ($wantarray) {
	1454	if (@_ == 0) {
	1455	_namedseq() unless %NAMEDSEQ;
	1456	return %NAMEDSEQ;
	1457	} elsif (@_ == 1) {
	1458	my $s;
	1459	if (%NAMEDSEQ) {
	1460	$s = $NAMEDSEQ{ $_[0] };
	1461	}
	1462	else {
	1463	$s = charnames::string_vianame($_[0]);
	1464	}
	1465	return defined $s ? map { ord($_) } split('', $s) : ();
	1466	}
	1467	} elsif (@_ == 1) {
	1468	return $NAMEDSEQ{ $_[0] } if %NAMEDSEQ;
	1469	return charnames::string_vianame($_[0]);
	1470	}
	1471	}
	1472	return;
	1473	}
	1474
	1475	my %NUMERIC;
	1476
	1477	sub _numeric {
	1478	my @numbers = _read_table("To/Nv.pl");
	1479	foreach my $entry (@numbers) {
	1480	my ($start, $end, $value) = @$entry;
	1481
	1482	# If value contains a slash, convert to decimal, add a reverse hash
	1483	# used by charinfo.
	1484	if ((my @rational = split /\//, $value) == 2) {
	1485	my $real = $rational[0] / $rational[1];
	1486	$real_to_rational{$real} = $value;
	1487	$value = $real;
	1488
	1489	# Should only be single element, but just in case...
	1490	for my $i ($start .. $end) {
	1491	$NUMERIC{$i} = $value;
	1492	}
	1493	}
	1494	else {
	1495	# The values require adjusting, as is in 'a' format
	1496	for my $i ($start .. $end) {
	1497	$NUMERIC{$i} = $value + $i - $start;
	1498	}
	1499	}
	1500	}
	1501
	1502	# Decided unsafe to use these that aren't officially part of the Unicode
	1503	# standard.
	1504	#use Math::Trig;
	1505	#my $pi = acos(-1.0);
	1506	#$NUMERIC{0x03C0} = $pi;
	1507
	1508	# Euler's constant, not to be confused with Euler's number
	1509	#$NUMERIC{0x2107} = 0.57721566490153286060651209008240243104215933593992;
	1510
	1511	# Euler's number
	1512	#$NUMERIC{0x212F} = 2.7182818284590452353602874713526624977572;
	1513
	1514	return;
	1515	}
	1516
	1517	=pod
	1518
	1519	=head2 B<num()>
	1520
	1521	use Unicode::UCD 'num';
	1522
	1523	my $val = num("123");
	1524	my $one_quarter = num("\N{VULGAR FRACTION 1/4}");
	1525
	1526	C<num> returns the numeric value of the input Unicode string; or C<undef> if it
	1527	doesn't think the entire string has a completely valid, safe numeric value.
	1528
	1529	If the string is just one character in length, the Unicode numeric value
	1530	is returned if it has one, or C<undef> otherwise. Note that this need
	1531	not be a whole number. C<num("\N{TIBETAN DIGIT HALF ZERO}")>, for
	1532	example returns -0.5.
	1533
	1534	=cut
	1535
	1536	#A few characters to which Unicode doesn't officially
	1537	#assign a numeric value are considered numeric by C<num>.
	1538	#These are:
	1539
	1540	# EULER CONSTANT 0.5772... (this is NOT Euler's number)
	1541	# SCRIPT SMALL E 2.71828... (this IS Euler's number)
	1542	# GREEK SMALL LETTER PI 3.14159...
	1543
	1544	=pod
	1545
	1546	If the string is more than one character, C<undef> is returned unless
	1547	all its characters are decimal digits (that is, they would match C<\d+>),
	1548	from the same script. For example if you have an ASCII '0' and a Bengali
	1549	'3', mixed together, they aren't considered a valid number, and C<undef>
	1550	is returned. A further restriction is that the digits all have to be of
	1551	the same form. A half-width digit mixed with a full-width one will
	1552	return C<undef>. The Arabic script has two sets of digits; C<num> will
	1553	return C<undef> unless all the digits in the string come from the same
	1554	set.
	1555
	1556	C<num> errs on the side of safety, and there may be valid strings of
	1557	decimal digits that it doesn't recognize. Note that Unicode defines
	1558	a number of "digit" characters that aren't "decimal digit" characters.
	1559	"Decimal digits" have the property that they have a positional value, i.e.,
	1560	there is a units position, a 10's position, a 100's, etc, AND they are
	1561	arranged in Unicode in blocks of 10 contiguous code points. The Chinese
	1562	digits, for example, are not in such a contiguous block, and so Unicode
	1563	doesn't view them as decimal digits, but merely digits, and so C<\d> will not
	1564	match them. A single-character string containing one of these digits will
	1565	have its decimal value returned by C<num>, but any longer string containing
	1566	only these digits will return C<undef>.
	1567
	1568	Strings of multiple sub- and superscripts are not recognized as numbers. You
	1569	can use either of the compatibility decompositions in Unicode::Normalize to
	1570	change these into digits, and then call C<num> on the result.
	1571
	1572	=cut
	1573
	1574	# To handle sub, superscripts, this could if called in list context,
	1575	# consider those, and return the <decomposition> type in the second
	1576	# array element.
	1577
	1578	sub num {
	1579	my $string = $_[0];
	1580
	1581	_numeric unless %NUMERIC;
	1582
	1583	my $length = length($string);
	1584	return $NUMERIC{ord($string)} if $length == 1;
	1585	return if $string =~ /\D/;
	1586	my $first_ord = ord(substr($string, 0, 1));
	1587	my $value = $NUMERIC{$first_ord};
	1588
	1589	# To be a valid decimal number, it should be in a block of 10 consecutive
	1590	# characters, whose values are 0, 1, 2, ... 9. Therefore this digit's
	1591	# value is its offset in that block from the character that means zero.
	1592	my $zero_ord = $first_ord - $value;
	1593
	1594	# Unicode 6.0 instituted the rule that only digits in a consecutive
	1595	# block of 10 would be considered decimal digits. If this is an earlier
	1596	# release, we verify that this first character is a member of such a
	1597	# block. That is, that the block of characters surrounding this one
	1598	# consists of all \d characters whose numeric values are the expected
	1599	# ones.
	1600	UnicodeVersion() unless defined $v_unicode_version;
	1601	if ($v_unicode_version lt v6.0.0) {
	1602	for my $i (0 .. 9) {
	1603	my $ord = $zero_ord + $i;
	1604	return unless chr($ord) =~ /\d/;
	1605	my $numeric = $NUMERIC{$ord};
	1606	return unless defined $numeric;
	1607	return unless $numeric == $i;
	1608	}
	1609	}
	1610
	1611	for my $i (1 .. $length -1) {
	1612
	1613	# Here we know either by verifying, or by fact of the first character
	1614	# being a \d in Unicode 6.0 or later, that any character between the
	1615	# character that means 0, and 9 positions above it must be \d, and
	1616	# must have its value correspond to its offset from the zero. Any
	1617	# characters outside these 10 do not form a legal number for this
	1618	# function.
	1619	my $ord = ord(substr($string, $i, 1));
	1620	my $digit = $ord - $zero_ord;
	1621	return unless $digit >= 0 && $digit <= 9;
	1622	$value = $value * 10 + $digit;
	1623	}
	1624
	1625	return $value;
	1626	}
	1627
	1628	=pod
	1629
	1630	=head2 B<prop_aliases()>
	1631
	1632	use Unicode::UCD 'prop_aliases';
	1633
	1634	my ($short_name, $full_name, @other_names) = prop_aliases("space");
	1635	my $same_full_name = prop_aliases("Space"); # Scalar context
	1636	my ($same_short_name) = prop_aliases("Space"); # gets 0th element
	1637	print "The full name is $full_name\n";
	1638	print "The short name is $short_name\n";
	1639	print "The other aliases are: ", join(", ", @other_names), "\n";
	1640
	1641	prints:
	1642	The full name is White_Space
	1643	The short name is WSpace
	1644	The other aliases are: Space
	1645
	1646	Most Unicode properties have several synonymous names. Typically, there is at
	1647	least a short name, convenient to type, and a long name that more fully
	1648	describes the property, and hence is more easily understood.
	1649
	1650	If you know one name for a Unicode property, you can use C<prop_aliases> to find
	1651	either the long name (when called in scalar context), or a list of all of the
	1652	names, somewhat ordered so that the short name is in the 0th element, the long
	1653	name in the next element, and any other synonyms are in the remaining
	1654	elements, in no particular order.
	1655
	1656	The long name is returned in a form nicely capitalized, suitable for printing.
	1657
	1658	The input parameter name is loosely matched, which means that white space,
	1659	hyphens, and underscores are ignored (except for the trailing underscore in
	1660	the old_form grandfathered-in C<"L_">, which is better written as C<"LC">, and
	1661	both of which mean C<General_Category=Cased Letter>).
	1662
	1663	If the name is unknown, C<undef> is returned (or an empty list in list
	1664	context). Note that Perl typically recognizes property names in regular
	1665	expressions with an optional C<"Is_>" (with or without the underscore)
	1666	prefixed to them, such as C<\p{isgc=punct}>. This function does not recognize
	1667	those in the input, returning C<undef>. Nor are they included in the output
	1668	as possible synonyms.
	1669
	1670	C<prop_aliases> does know about the Perl extensions to Unicode properties,
	1671	such as C<Any> and C<XPosixAlpha>, and the single form equivalents to Unicode
	1672	properties such as C<XDigit>, C<Greek>, C<In_Greek>, and C<Is_Greek>. The
	1673	final example demonstrates that the C<"Is_"> prefix is recognized for these
	1674	extensions; it is needed to resolve ambiguities. For example,
	1675	C<prop_aliases('lc')> returns the list C<(lc, Lowercase_Mapping)>, but
	1676	C<prop_aliases('islc')> returns C<(Is_LC, Cased_Letter)>. This is
	1677	because C<islc> is a Perl extension which is short for
	1678	C<General_Category=Cased Letter>. The lists returned for the Perl extensions
	1679	will not include the C<"Is_"> prefix (whether or not the input had it) unless
	1680	needed to resolve ambiguities, as shown in the C<"islc"> example, where the
	1681	returned list had one element containing C<"Is_">, and the other without.
	1682
	1683	It is also possible for the reverse to happen: C<prop_aliases('isc')> returns
	1684	the list C<(isc, ISO_Comment)>; whereas C<prop_aliases('c')> returns
	1685	C<(C, Other)> (the latter being a Perl extension meaning
	1686	C<General_Category=Other>.
	1687	L<perluniprops/Properties accessible through Unicode::UCD> lists the available
	1688	forms, including which ones are discouraged from use.
	1689
	1690	Those discouraged forms are accepted as input to C<prop_aliases>, but are not
	1691	returned in the lists. C<prop_aliases('isL&')> and C<prop_aliases('isL_')>,
	1692	which are old synonyms for C<"Is_LC"> and should not be used in new code, are
	1693	examples of this. These both return C<(Is_LC, Cased_Letter)>. Thus this
	1694	function allows you to take a discouraged form, and find its acceptable
	1695	alternatives. The same goes with single-form Block property equivalences.
	1696	Only the forms that begin with C<"In_"> are not discouraged; if you pass
	1697	C<prop_aliases> a discouraged form, you will get back the equivalent ones that
	1698	begin with C<"In_">. It will otherwise look like a new-style block name (see.
	1699	L</Old-style versus new-style block names>).
	1700
	1701	C<prop_aliases> does not know about any user-defined properties, and will
	1702	return C<undef> if called with one of those. Likewise for Perl internal
	1703	properties, with the exception of "Perl_Decimal_Digit" which it does know
	1704	about (and which is documented below in L</prop_invmap()>).
	1705
	1706	=cut
	1707
	1708	# It may be that there are use cases where the discouraged forms should be
	1709	# returned. If that comes up, an optional boolean second parameter to the
	1710	# function could be created, for example.
	1711
	1712	# These are created by mktables for this routine and stored in unicore/UCD.pl
	1713	# where their structures are described.
	1714	our %string_property_loose_to_name;
	1715	our %ambiguous_names;
	1716	our %loose_perlprop_to_name;
	1717	our %prop_aliases;
	1718
	1719	sub prop_aliases ($) {
	1720	my $prop = $_[0];
	1721	return unless defined $prop;
	1722
	1723	require "unicore/UCD.pl";
	1724	require "unicore/Heavy.pl";
	1725	require "utf8_heavy.pl";
	1726
	1727	# The property name may be loosely or strictly matched; we don't know yet.
	1728	# But both types use lower-case.
	1729	$prop = lc $prop;
	1730
	1731	# It is loosely matched if its lower case isn't known to be strict.
	1732	my $list_ref;
	1733	if (! exists $utf8::stricter_to_file_of{$prop}) {
	1734	my $loose = utf8::_loose_name($prop);
	1735
	1736	# There is a hash that converts from any loose name to its standard
	1737	# form, mapping all synonyms for a name to one name that can be used
	1738	# as a key into another hash. The whole concept is for memory
	1739	# savings, as the second hash doesn't have to have all the
	1740	# combinations. Actually, there are two hashes that do the
	1741	# converstion. One is used in utf8_heavy.pl (stored in Heavy.pl) for
	1742	# looking up properties matchable in regexes. This function needs to
	1743	# access string properties, which aren't available in regexes, so a
	1744	# second conversion hash is made for them (stored in UCD.pl). Look in
	1745	# the string one now, as the rest can have an optional 'is' prefix,
	1746	# which these don't.
	1747	if (exists $string_property_loose_to_name{$loose}) {
	1748
	1749	# Convert to its standard loose name.
	1750	$prop = $string_property_loose_to_name{$loose};
	1751	}
	1752	else {
	1753	my $retrying = 0; # bool. ? Has an initial 'is' been stripped
	1754	RETRY:
	1755	if (exists $utf8::loose_property_name_of{$loose}
	1756	&& (! $retrying
	1757	\|\| ! exists $ambiguous_names{$loose}))
	1758	{
	1759	# Found an entry giving the standard form. We don't get here
	1760	# (in the test above) when we've stripped off an
	1761	# 'is' and the result is an ambiguous name. That is because
	1762	# these are official Unicode properties (though Perl can have
	1763	# an optional 'is' prefix meaning the official property), and
	1764	# all ambiguous cases involve a Perl single-form extension
	1765	# for the gc, script, or block properties, and the stripped
	1766	# 'is' means that they mean one of those, and not one of
	1767	# these
	1768	$prop = $utf8::loose_property_name_of{$loose};
	1769	}
	1770	elsif (exists $loose_perlprop_to_name{$loose}) {
	1771
	1772	# This hash is specifically for this function to list Perl
	1773	# extensions that aren't in the earlier hashes. If there is
	1774	# only one element, the short and long names are identical.
	1775	# Otherwise the form is already in the same form as
	1776	# %prop_aliases, which is handled at the end of the function.
	1777	$list_ref = $loose_perlprop_to_name{$loose};
	1778	if (@$list_ref == 1) {
	1779	my @list = ($list_ref->[0], $list_ref->[0]);
	1780	$list_ref = \@list;
	1781	}
	1782	}
	1783	elsif (! exists $utf8::loose_to_file_of{$loose}) {
	1784
	1785	# loose_to_file_of is a complete list of loose names. If not
	1786	# there, the input is unknown.
	1787	return;
	1788	}
	1789	else {
	1790
	1791	# Here we found the name but not its aliases, so it has to
	1792	# exist. This means it must be one of the Perl single-form
	1793	# extensions. First see if it is for a property-value
	1794	# combination in one of the following properties.
	1795	my @list;
	1796	foreach my $property ("gc", "script") {
	1797	@list = prop_value_aliases($property, $loose);
	1798	last if @list;
	1799	}
	1800	if (@list) {
	1801
	1802	# Here, it is one of those property-value combination
	1803	# single-form synonyms. There are ambiguities with some
	1804	# of these. Check against the list for these, and adjust
	1805	# if necessary.
	1806	for my $i (0 .. @list -1) {
	1807	if (exists $ambiguous_names
	1808	{utf8::_loose_name(lc $list[$i])})
	1809	{
	1810	# The ambiguity is resolved by toggling whether or
	1811	# not it has an 'is' prefix
	1812	$list[$i] =~ s/^Is_// or $list[$i] =~ s/^/Is_/;
	1813	}
	1814	}
	1815	return @list;
	1816	}
	1817
	1818	# Here, it wasn't one of the gc or script single-form
	1819	# extensions. It could be a block property single-form
	1820	# extension. An 'in' prefix definitely means that, and should
	1821	# be looked up without the prefix. However, starting in
	1822	# Unicode 6.1, we have to special case 'indic...', as there
	1823	# is a property that begins with that name. We shouldn't
	1824	# strip the 'in' from that. I'm (khw) generalizing this to
	1825	# 'indic' instead of the single property, because I suspect
	1826	# that others of this class may come along in the future.
	1827	# However, this could backfire and a block created whose name
	1828	# begins with 'dic...', and we would want to strip the 'in'.
	1829	# At which point this would have to be tweaked.
	1830	my $began_with_in = $loose =~ s/^in(?!dic)//;
	1831	@list = prop_value_aliases("block", $loose);
	1832	if (@list) {
	1833	map { $_ =~ s/^/In_/ } @list;
	1834	return @list;
	1835	}
	1836
	1837	# Here still haven't found it. The last opportunity for it
	1838	# being valid is only if it began with 'is'. We retry without
	1839	# the 'is', setting a flag to that effect so that we don't
	1840	# accept things that begin with 'isis...'
	1841	if (! $retrying && ! $began_with_in && $loose =~ s/^is//) {
	1842	$retrying = 1;
	1843	goto RETRY;
	1844	}
	1845
	1846	# Here, didn't find it. Since it was in %loose_to_file_of, we
	1847	# should have been able to find it.
	1848	carp __PACKAGE__, "::prop_aliases: Unexpectedly could not find '$prop'. Send bug report to perlbug\@perl.org";
	1849	return;
	1850	}
	1851	}
	1852	}
	1853
	1854	if (! $list_ref) {
	1855	# Here, we have set $prop to a standard form name of the input. Look
	1856	# it up in the structure created by mktables for this purpose, which
	1857	# contains both strict and loosely matched properties. Avoid
	1858	# autovivifying.
	1859	$list_ref = $prop_aliases{$prop} if exists $prop_aliases{$prop};
	1860	return unless $list_ref;
	1861	}
	1862
	1863	# The full name is in element 1.
	1864	return $list_ref->[1] unless wantarray;
	1865
	1866	return @{_dclone $list_ref};
	1867	}
	1868
	1869	=pod
	1870
	1871	=head2 B<prop_value_aliases()>
	1872
	1873	use Unicode::UCD 'prop_value_aliases';
	1874
	1875	my ($short_name, $full_name, @other_names)
	1876	= prop_value_aliases("Gc", "Punct");
	1877	my $same_full_name = prop_value_aliases("Gc", "P"); # Scalar cntxt
	1878	my ($same_short_name) = prop_value_aliases("Gc", "P"); # gets 0th
	1879	# element
	1880	print "The full name is $full_name\n";
	1881	print "The short name is $short_name\n";
	1882	print "The other aliases are: ", join(", ", @other_names), "\n";
	1883
	1884	prints:
	1885	The full name is Punctuation
	1886	The short name is P
	1887	The other aliases are: Punct
	1888
	1889	Some Unicode properties have a restricted set of legal values. For example,
	1890	all binary properties are restricted to just C<true> or C<false>; and there
	1891	are only a few dozen possible General Categories.
	1892
	1893	For such properties, there are usually several synonyms for each possible
	1894	value. For example, in binary properties, I<truth> can be represented by any of
	1895	the strings "Y", "Yes", "T", or "True"; and the General Category
	1896	"Punctuation" by that string, or "Punct", or simply "P".
	1897
	1898	Like property names, there is typically at least a short name for each such
	1899	property-value, and a long name. If you know any name of the property-value,
	1900	you can use C<prop_value_aliases>() to get the long name (when called in
	1901	scalar context), or a list of all the names, with the short name in the 0th
	1902	element, the long name in the next element, and any other synonyms in the
	1903	remaining elements, in no particular order, except that any all-numeric
	1904	synonyms will be last.
	1905
	1906	The long name is returned in a form nicely capitalized, suitable for printing.
	1907
	1908	Case, white space, hyphens, and underscores are ignored in the input parameters
	1909	(except for the trailing underscore in the old-form grandfathered-in general
	1910	category property value C<"L_">, which is better written as C<"LC">).
	1911
	1912	If either name is unknown, C<undef> is returned. Note that Perl typically
	1913	recognizes property names in regular expressions with an optional C<"Is_>"
	1914	(with or without the underscore) prefixed to them, such as C<\p{isgc=punct}>.
	1915	This function does not recognize those in the property parameter, returning
	1916	C<undef>.
	1917
	1918	If called with a property that doesn't have synonyms for its values, it
	1919	returns the input value, possibly normalized with capitalization and
	1920	underscores.
	1921
	1922	For the block property, new-style block names are returned (see
	1923	L</Old-style versus new-style block names>).
	1924
	1925	To find the synonyms for single-forms, such as C<\p{Any}>, use
	1926	L</prop_aliases()> instead.
	1927
	1928	C<prop_value_aliases> does not know about any user-defined properties, and
	1929	will return C<undef> if called with one of those.
	1930
	1931	=cut
	1932
	1933	# These are created by mktables for this routine and stored in unicore/UCD.pl
	1934	# where their structures are described.
	1935	our %loose_to_standard_value;
	1936	our %prop_value_aliases;
	1937
	1938	sub prop_value_aliases ($$) {
	1939	my ($prop, $value) = @_;
	1940	return unless defined $prop && defined $value;
	1941
	1942	require "unicore/UCD.pl";
	1943	require "utf8_heavy.pl";
	1944
	1945	# Find the property name synonym that's used as the key in other hashes,
	1946	# which is element 0 in the returned list.
	1947	($prop) = prop_aliases($prop);
	1948	return if ! $prop;
	1949	$prop = utf8::_loose_name(lc $prop);
	1950
	1951	# Here is a legal property, but the hash below (created by mktables for
	1952	# this purpose) only knows about the properties that have a very finite
	1953	# number of potential values, that is not ones whose value could be
	1954	# anything, like most (if not all) string properties. These don't have
	1955	# synonyms anyway. Simply return the input. For example, there is no
	1956	# synonym for ('Uppercase_Mapping', A').
	1957	return $value if ! exists $prop_value_aliases{$prop};
	1958
	1959	# The value name may be loosely or strictly matched; we don't know yet.
	1960	# But both types use lower-case.
	1961	$value = lc $value;
	1962
	1963	# If the name isn't found under loose matching, it certainly won't be
	1964	# found under strict
	1965	my $loose_value = utf8::_loose_name($value);
	1966	return unless exists $loose_to_standard_value{"$prop=$loose_value"};
	1967
	1968	# Similarly if the combination under loose matching doesn't exist, it
	1969	# won't exist under strict.
	1970	my $standard_value = $loose_to_standard_value{"$prop=$loose_value"};
	1971	return unless exists $prop_value_aliases{$prop}{$standard_value};
	1972
	1973	# Here we did find a combination under loose matching rules. But it could
	1974	# be that is a strict property match that shouldn't have matched.
	1975	# %prop_value_aliases is set up so that the strict matches will appear as
	1976	# if they were in loose form. Thus, if the non-loose version is legal,
	1977	# we're ok, can skip the further check.
	1978	if (! exists $utf8::stricter_to_file_of{"$prop=$value"}
	1979
	1980	# We're also ok and skip the further check if value loosely matches.
	1981	# mktables has verified that no strict name under loose rules maps to
	1982	# an existing loose name. This code relies on the very limited
	1983	# circumstances that strict names can be here. Strict name matching
	1984	# happens under two conditions:
	1985	# 1) when the name begins with an underscore. But this function
	1986	# doesn't accept those, and %prop_value_aliases doesn't have
	1987	# them.
	1988	# 2) When the values are numeric, in which case we need to look
	1989	# further, but their squeezed-out loose values will be in
	1990	# %stricter_to_file_of
	1991	&& exists $utf8::stricter_to_file_of{"$prop=$loose_value"})
	1992	{
	1993	# The only thing that's legal loosely under strict is that can have an
	1994	# underscore between digit pairs XXX
	1995	while ($value =~ s/(\d)_(\d)/$1$2/g) {}
	1996	return unless exists $utf8::stricter_to_file_of{"$prop=$value"};
	1997	}
	1998
	1999	# Here, we know that the combination exists. Return it.
	2000	my $list_ref = $prop_value_aliases{$prop}{$standard_value};
	2001	if (@$list_ref > 1) {
	2002	# The full name is in element 1.
	2003	return $list_ref->[1] unless wantarray;
	2004
	2005	return @{_dclone $list_ref};
	2006	}
	2007
	2008	return $list_ref->[0] unless wantarray;
	2009
	2010	# Only 1 element means that it repeats
	2011	return ( $list_ref->[0], $list_ref->[0] );
	2012	}
	2013
	2014	# All 1 bits is the largest possible UV.
	2015	$Unicode::UCD::MAX_CP = ~0;
	2016
	2017	=pod
	2018
	2019	=head2 B<prop_invlist()>
	2020
	2021	C<prop_invlist> returns an inversion list (described below) that defines all the
	2022	code points for the binary Unicode property (or "property=value" pair) given
	2023	by the input parameter string:
	2024
	2025	use feature 'say';
	2026	use Unicode::UCD 'prop_invlist';
	2027	say join ", ", prop_invlist("Any");
	2028
	2029	prints:
	2030	0, 1114112
	2031
	2032	If the input is unknown C<undef> is returned in scalar context; an empty-list
	2033	in list context. If the input is known, the number of elements in
	2034	the list is returned if called in scalar context.
	2035
	2036	L<perluniprops\|perluniprops/Properties accessible through \p{} and \P{}> gives
	2037	the list of properties that this function accepts, as well as all the possible
	2038	forms for them (including with the optional "Is_" prefixes). (Except this
	2039	function doesn't accept any Perl-internal properties, some of which are listed
	2040	there.) This function uses the same loose or tighter matching rules for
	2041	resolving the input property's name as is done for regular expressions. These
	2042	are also specified in L<perluniprops\|perluniprops/Properties accessible
	2043	through \p{} and \P{}>. Examples of using the "property=value" form are:
	2044
	2045	say join ", ", prop_invlist("Script=Shavian");
	2046
	2047	prints:
	2048	66640, 66688
	2049
	2050	say join ", ", prop_invlist("ASCII_Hex_Digit=No");
	2051
	2052	prints:
	2053	0, 48, 58, 65, 71, 97, 103
	2054
	2055	say join ", ", prop_invlist("ASCII_Hex_Digit=Yes");
	2056
	2057	prints:
	2058	48, 58, 65, 71, 97, 103
	2059
	2060	Inversion lists are a compact way of specifying Unicode property-value
	2061	definitions. The 0th item in the list is the lowest code point that has the
	2062	property-value. The next item (item [1]) is the lowest code point beyond that
	2063	one that does NOT have the property-value. And the next item beyond that
	2064	([2]) is the lowest code point beyond that one that does have the
	2065	property-value, and so on. Put another way, each element in the list gives
	2066	the beginning of a range that has the property-value (for even numbered
	2067	elements), or doesn't have the property-value (for odd numbered elements).
	2068	The name for this data structure stems from the fact that each element in the
	2069	list toggles (or inverts) whether the corresponding range is or isn't on the
	2070	list.
	2071
	2072	In the final example above, the first ASCII Hex digit is code point 48, the
	2073	character "0", and all code points from it through 57 (a "9") are ASCII hex
	2074	digits. Code points 58 through 64 aren't, but 65 (an "A") through 70 (an "F")
	2075	are, as are 97 ("a") through 102 ("f"). 103 starts a range of code points
	2076	that aren't ASCII hex digits. That range extends to infinity, which on your
	2077	computer can be found in the variable C<$Unicode::UCD::MAX_CP>. (This
	2078	variable is as close to infinity as Perl can get on your platform, and may be
	2079	too high for some operations to work; you may wish to use a smaller number for
	2080	your purposes.)
	2081
	2082	Note that the inversion lists returned by this function can possibly include
	2083	non-Unicode code points, that is anything above 0x10FFFF. This is in
	2084	contrast to Perl regular expression matches on those code points, in which a
	2085	non-Unicode code point always fails to match. For example, both of these have
	2086	the same result:
	2087
	2088	chr(0x110000) =~ \p{ASCII_Hex_Digit=True} # Fails.
	2089	chr(0x110000) =~ \p{ASCII_Hex_Digit=False} # Fails!
	2090
	2091	And both raise a warning that a Unicode property is being used on a
	2092	non-Unicode code point. It is arguable as to which is the correct thing to do
	2093	here. This function has chosen the way opposite to the Perl regular
	2094	expression behavior. This allows you to easily flip to the Perl regular
	2095	expression way (for you to go in the other direction would be far harder).
	2096	Simply add 0x110000 at the end of the non-empty returned list if it isn't
	2097	already that value; and pop that value if it is; like:
	2098
	2099	my @list = prop_invlist("foo");
	2100	if (@list) {
	2101	if ($list[-1] == 0x110000) {
	2102	pop @list; # Defeat the turning on for above Unicode
	2103	}
	2104	else {
	2105	push @list, 0x110000; # Turn off for above Unicode
	2106	}
	2107	}
	2108
	2109	It is a simple matter to expand out an inversion list to a full list of all
	2110	code points that have the property-value:
	2111
	2112	my @invlist = prop_invlist($property_name);
	2113	die "empty" unless @invlist;
	2114	my @full_list;
	2115	for (my $i = 0; $i < @invlist; $i += 2) {
	2116	my $upper = ($i + 1) < @invlist
	2117	? $invlist[$i+1] - 1 # In range
	2118	: $Unicode::UCD::MAX_CP; # To infinity. You may want
	2119	# to stop much much earlier;
	2120	# going this high may expose
	2121	# perl deficiencies with very
	2122	# large numbers.
	2123	for my $j ($invlist[$i] .. $upper) {
	2124	push @full_list, $j;
	2125	}
	2126	}
	2127
	2128	C<prop_invlist> does not know about any user-defined nor Perl internal-only
	2129	properties, and will return C<undef> if called with one of those.
	2130
	2131	The L</search_invlist()> function is provided for finding a code point within
	2132	an inversion list.
	2133
	2134	=cut
	2135
	2136	# User-defined properties could be handled with some changes to utf8_heavy.pl;
	2137	# and implementing here of dealing with EXTRAS. If done, consideration should
	2138	# be given to the fact that the user subroutine could return different results
	2139	# with each call; security issues need to be thought about.
	2140
	2141	# These are created by mktables for this routine and stored in unicore/UCD.pl
	2142	# where their structures are described.
	2143	our %loose_defaults;
	2144	our $MAX_UNICODE_CODEPOINT;
	2145
	2146	sub prop_invlist ($;$) {
	2147	my $prop = $_[0];
	2148
	2149	# Undocumented way to get at Perl internal properties
	2150	my $internal_ok = defined $_[1] && $_[1] eq '_perl_core_internal_ok';
	2151
	2152	return if ! defined $prop;
	2153
	2154	require "utf8_heavy.pl";
	2155
	2156	# Warnings for these are only for regexes, so not applicable to us
	2157	no warnings 'deprecated';
	2158
	2159	# Get the swash definition of the property-value.
	2160	my $swash = utf8::SWASHNEW(__PACKAGE__, $prop, undef, 1, 0);
	2161
	2162	# Fail if not found, or isn't a boolean property-value, or is a
	2163	# user-defined property, or is internal-only.
	2164	return if ! $swash
	2165	\|\| ref $swash eq ""
	2166	\|\| $swash->{'BITS'} != 1
	2167	\|\| $swash->{'USER_DEFINED'}
	2168	\|\| (! $internal_ok && $prop =~ /^\s*_/);
	2169
	2170	if ($swash->{'EXTRAS'}) {
	2171	carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has EXTRAS magic";
	2172	return;
	2173	}
	2174	if ($swash->{'SPECIALS'}) {
	2175	carp __PACKAGE__, "::prop_invlist: swash returned for $prop unexpectedly has SPECIALS magic";
	2176	return;
	2177	}
	2178
	2179	my @invlist;
	2180
	2181	# The input lines look like:
	2182	# 0041\t005A # [26]
	2183	# 005F
	2184
	2185	# Split into lines, stripped of trailing comments
	2186	foreach my $range (split "\n",
	2187	$swash->{'LIST'} =~ s/ \s* (?: \# .* )? $ //xmgr)
	2188	{
	2189	# And find the beginning and end of the range on the line
	2190	my ($hex_begin, $hex_end) = split "\t", $range;
	2191	my $begin = hex $hex_begin;
	2192
	2193	# If the new range merely extends the old, we remove the marker
	2194	# created the last time through the loop for the old's end, which
	2195	# causes the new one's end to be used instead.
	2196	if (@invlist && $begin == $invlist[-1]) {
	2197	pop @invlist;
	2198	}
	2199	else {
	2200	# Add the beginning of the range
	2201	push @invlist, $begin;
	2202	}
	2203
	2204	if (defined $hex_end) { # The next item starts with the code point 1
	2205	# beyond the end of the range.
	2206	push @invlist, hex($hex_end) + 1;
	2207	}
	2208	else { # No end of range, is a single code point.
	2209	push @invlist, $begin + 1;
	2210	}
	2211	}
	2212
	2213	require "unicore/UCD.pl";
	2214	my $FIRST_NON_UNICODE = $MAX_UNICODE_CODEPOINT + 1;
	2215
	2216	# Could need to be inverted: add or subtract a 0 at the beginning of the
	2217	# list. And to keep it from matching non-Unicode, add or subtract the
	2218	# first non-unicode code point.
	2219	if ($swash->{'INVERT_IT'}) {
	2220	if (@invlist && $invlist[0] == 0) {
	2221	shift @invlist;
	2222	}
	2223	else {
	2224	unshift @invlist, 0;
	2225	}
	2226	if (@invlist && $invlist[-1] == $FIRST_NON_UNICODE) {
	2227	pop @invlist;
	2228	}
	2229	else {
	2230	push @invlist, $FIRST_NON_UNICODE;
	2231	}
	2232	}
	2233
	2234	# Here, the list is set up to include only Unicode code points. But, if
	2235	# the table is the default one for the property, it should contain all
	2236	# non-Unicode code points. First calculate the loose name for the
	2237	# property. This is done even for strict-name properties, as the data
	2238	# structure that mktables generates for us is set up so that we don't have
	2239	# to worry about that. The property-value needs to be split if compound,
	2240	# as the loose rules need to be independently calculated on each part. We
	2241	# know that it is syntactically valid, or SWASHNEW would have failed.
	2242
	2243	$prop = lc $prop;
	2244	my ($prop_only, $table) = split /\s[:=]\s/, $prop;
	2245	if ($table) {
	2246
	2247	# May have optional prefixed 'is'
	2248	$prop = utf8::_loose_name($prop_only) =~ s/^is//r;
	2249	$prop = $utf8::loose_property_name_of{$prop};
	2250	$prop .= "=" . utf8::_loose_name($table);
	2251	}
	2252	else {
	2253	$prop = utf8::_loose_name($prop);
	2254	}
	2255	if (exists $loose_defaults{$prop}) {
	2256
	2257	# Here, is the default table. If a range ended with 10ffff, instead
	2258	# continue that range to infinity, by popping the 110000; otherwise,
	2259	# add the range from 11000 to infinity
	2260	if (! @invlist \|\| $invlist[-1] != $FIRST_NON_UNICODE) {
	2261	push @invlist, $FIRST_NON_UNICODE;
	2262	}
	2263	else {
	2264	pop @invlist;
	2265	}
	2266	}
	2267
	2268	return @invlist;
	2269	}
	2270
	2271	=pod
	2272
	2273	=head2 B<prop_invmap()>
	2274
	2275	use Unicode::UCD 'prop_invmap';
	2276	my ($list_ref, $map_ref, $format, $default)
	2277	= prop_invmap("General Category");
	2278
	2279	C<prop_invmap> is used to get the complete mapping definition for a property,
	2280	in the form of an inversion map. An inversion map consists of two parallel
	2281	arrays. One is an ordered list of code points that mark range beginnings, and
	2282	the other gives the value (or mapping) that all code points in the
	2283	corresponding range have.
	2284
	2285	C<prop_invmap> is called with the name of the desired property. The name is
	2286	loosely matched, meaning that differences in case, white-space, hyphens, and
	2287	underscores are not meaningful (except for the trailing underscore in the
	2288	old-form grandfathered-in property C<"L_">, which is better written as C<"LC">,
	2289	or even better, C<"Gc=LC">).
	2290
	2291	Many Unicode properties have more than one name (or alias). C<prop_invmap>
	2292	understands all of these, including Perl extensions to them. Ambiguities are
	2293	resolved as described above for L</prop_aliases()>. The Perl internal
	2294	property "Perl_Decimal_Digit, described below, is also accepted. C<undef> is
	2295	returned if the property name is unknown.
	2296	See L<perluniprops/Properties accessible through Unicode::UCD> for the
	2297	properties acceptable as inputs to this function.
	2298
	2299	It is a fatal error to call this function except in list context.
	2300
	2301	In addition to the two arrays that form the inversion map, C<prop_invmap>
	2302	returns two other values; one is a scalar that gives some details as to the
	2303	format of the entries of the map array; the other is a default value, useful
	2304	in maps whose format name begins with the letter C<"a">, as described
	2305	L<below in its subsection\|/a>; and for specialized purposes, such as
	2306	converting to another data structure, described at the end of this main
	2307	section.
	2308
	2309	This means that C<prop_invmap> returns a 4 element list. For example,
	2310
	2311	my ($blocks_ranges_ref, $blocks_maps_ref, $format, $default)
	2312	= prop_invmap("Block");
	2313
	2314	In this call, the two arrays will be populated as shown below (for Unicode
	2315	6.0):
	2316
	2317	Index @blocks_ranges @blocks_maps
	2318	0 0x0000 Basic Latin
	2319	1 0x0080 Latin-1 Supplement
	2320	2 0x0100 Latin Extended-A
	2321	3 0x0180 Latin Extended-B
	2322	4 0x0250 IPA Extensions
	2323	5 0x02B0 Spacing Modifier Letters
	2324	6 0x0300 Combining Diacritical Marks
	2325	7 0x0370 Greek and Coptic
	2326	8 0x0400 Cyrillic
	2327	...
	2328	233 0x2B820 No_Block
	2329	234 0x2F800 CJK Compatibility Ideographs Supplement
	2330	235 0x2FA20 No_Block
	2331	236 0xE0000 Tags
	2332	237 0xE0080 No_Block
	2333	238 0xE0100 Variation Selectors Supplement
	2334	239 0xE01F0 No_Block
	2335	240 0xF0000 Supplementary Private Use Area-A
	2336	241 0x100000 Supplementary Private Use Area-B
	2337	242 0x110000 No_Block
	2338
	2339	The first line (with Index [0]) means that the value for code point 0 is "Basic
	2340	Latin". The entry "0x0080" in the @blocks_ranges column in the second line
	2341	means that the value from the first line, "Basic Latin", extends to all code
	2342	points in the range from 0 up to but not including 0x0080, that is, through
	2343	127. In other words, the code points from 0 to 127 are all in the "Basic
	2344	Latin" block. Similarly, all code points in the range from 0x0080 up to (but
	2345	not including) 0x0100 are in the block named "Latin-1 Supplement", etc.
	2346	(Notice that the return is the old-style block names; see L</Old-style versus
	2347	new-style block names>).
	2348
	2349	The final line (with Index [242]) means that the value for all code points above
	2350	the legal Unicode maximum code point have the value "No_Block", which is the
	2351	term Unicode uses for a non-existing block.
	2352
	2353	The arrays completely specify the mappings for all possible code points.
	2354	The final element in an inversion map returned by this function will always be
	2355	for the range that consists of all the code points that aren't legal Unicode,
	2356	but that are expressible on the platform. (That is, it starts with code point
	2357	0x110000, the first code point above the legal Unicode maximum, and extends to
	2358	infinity.) The value for that range will be the same that any typical
	2359	unassigned code point has for the specified property. (Certain unassigned
	2360	code points are not "typical"; for example the non-character code points, or
	2361	those in blocks that are to be written right-to-left. The above-Unicode
	2362	range's value is not based on these atypical code points.) It could be argued
	2363	that, instead of treating these as unassigned Unicode code points, the value
	2364	for this range should be C<undef>. If you wish, you can change the returned
	2365	arrays accordingly.
	2366
	2367	The maps for almost all properties are simple scalars that should be
	2368	interpreted as-is.
	2369	These values are those given in the Unicode-supplied data files, which may be
	2370	inconsistent as to capitalization and as to which synonym for a property-value
	2371	is given. The results may be normalized by using the L</prop_value_aliases()>
	2372	function.
	2373
	2374	There are exceptions to the simple scalar maps. Some properties have some
	2375	elements in their map list that are themselves lists of scalars; and some
	2376	special strings are returned that are not to be interpreted as-is. Element
	2377	[2] (placed into C<$format> in the example above) of the returned four element
	2378	list tells you if the map has any of these special elements or not, as follows:
	2379
	2380	=over
	2381
	2382	=item B<C<s>>
	2383
	2384	means all the elements of the map array are simple scalars, with no special
	2385	elements. Almost all properties are like this, like the C<block> example
	2386	above.
	2387
	2388	=item B<C<sl>>
	2389
	2390	means that some of the map array elements have the form given by C<"s">, and
	2391	the rest are lists of scalars. For example, here is a portion of the output
	2392	of calling C<prop_invmap>() with the "Script Extensions" property:
	2393
	2394	@scripts_ranges @scripts_maps
	2395	...
	2396	0x0953 Devanagari
	2397	0x0964 [ Bengali, Devanagari, Gurumukhi, Oriya ]
	2398	0x0966 Devanagari
	2399	0x0970 Common
	2400
	2401	Here, the code points 0x964 and 0x965 are both used in Bengali,
	2402	Devanagari, Gurmukhi, and Oriya, but no other scripts.
	2403
	2404	The Name_Alias property is also of this form. But each scalar consists of two
	2405	components: 1) the name, and 2) the type of alias this is. They are
	2406	separated by a colon and a space. In Unicode 6.1, there are several alias types:
	2407
	2408	=over
	2409
	2410	=item C<correction>
	2411
	2412	indicates that the name is a corrected form for the
	2413	original name (which remains valid) for the same code point.
	2414
	2415	=item C<control>
	2416
	2417	adds a new name for a control character.
	2418
	2419	=item C<alternate>
	2420
	2421	is an alternate name for a character
	2422
	2423	=item C<figment>
	2424
	2425	is a name for a character that has been documented but was never in any
	2426	actual standard.
	2427
	2428	=item C<abbreviation>
	2429
	2430	is a common abbreviation for a character
	2431
	2432	=back
	2433
	2434	The lists are ordered (roughly) so the most preferred names come before less
	2435	preferred ones.
	2436
	2437	For example,
	2438
	2439	@aliases_ranges @alias_maps
	2440	...
	2441	0x009E [ 'PRIVACY MESSAGE: control', 'PM: abbreviation' ]
	2442	0x009F [ 'APPLICATION PROGRAM COMMAND: control',
	2443	'APC: abbreviation'
	2444	]
	2445	0x00A0 'NBSP: abbreviation'
	2446	0x00A1 ""
	2447	0x00AD 'SHY: abbreviation'
	2448	0x00AE ""
	2449	0x01A2 'LATIN CAPITAL LETTER GHA: correction'
	2450	0x01A3 'LATIN SMALL LETTER GHA: correction'
	2451	0x01A4 ""
	2452	...
	2453
	2454	A map to the empty string means that there is no alias defined for the code
	2455	point.
	2456
	2457	=item B<C<a>>
	2458
	2459	is like C<"s"> in that all the map array elements are scalars, but here they are
	2460	restricted to all being integers, and some have to be adjusted (hence the name
	2461	C<"a">) to get the correct result. For example, in:
	2462
	2463	my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
	2464	= prop_invmap("Simple_Uppercase_Mapping");
	2465
	2466	the returned arrays look like this:
	2467
	2468	@$uppers_ranges_ref @$uppers_maps_ref Note
	2469	0 0
	2470	97 65 'a' maps to 'A', b => B ...
	2471	123 0
	2472	181 924 MICRO SIGN => Greek Cap MU
	2473	182 0
	2474	...
	2475
	2476	and C<$default> is 0.
	2477
	2478	Let's start with the second line. It says that the uppercase of code point 97
	2479	is 65; or C<uc("a")> == "A". But the line is for the entire range of code
	2480	points 97 through 122. To get the mapping for any code point in this range,
	2481	you take the offset it has from the beginning code point of the range, and add
	2482	that to the mapping for that first code point. So, the mapping for 122 ("z")
	2483	is derived by taking the offset of 122 from 97 (=25) and adding that to 65,
	2484	yielding 90 ("z"). Likewise for everything in between.
	2485
	2486	Requiring this simple adjustment allows the returned arrays to be
	2487	significantly smaller than otherwise, up to a factor of 10, speeding up
	2488	searching through them.
	2489
	2490	Ranges that map to C<$default>, C<"0">, behave somewhat differently. For
	2491	these, each code point maps to itself. So, in the first line in the example,
	2492	S<C<ord(uc(chr(0)))>> is 0, S<C<ord(uc(chr(1)))>> is 1, ..
	2493	S<C<ord(uc(chr(96)))>> is 96.
	2494
	2495	=item B<C<al>>
	2496
	2497	means that some of the map array elements have the form given by C<"a">, and
	2498	the rest are ordered lists of code points.
	2499	For example, in:
	2500
	2501	my ($uppers_ranges_ref, $uppers_maps_ref, $format, $default)
	2502	= prop_invmap("Uppercase_Mapping");
	2503
	2504	the returned arrays look like this:
	2505
	2506	@$uppers_ranges_ref @$uppers_maps_ref
	2507	0 0
	2508	97 65
	2509	123 0
	2510	181 924
	2511	182 0
	2512	...
	2513	0x0149 [ 0x02BC 0x004E ]
	2514	0x014A 0
	2515	0x014B 330
	2516	...
	2517
	2518	This is the full Uppercase_Mapping property (as opposed to the
	2519	Simple_Uppercase_Mapping given in the example for format C<"a">). The only
	2520	difference between the two in the ranges shown is that the code point at
	2521	0x0149 (LATIN SMALL LETTER N PRECEDED BY APOSTROPHE) maps to a string of two
	2522	characters, 0x02BC (MODIFIER LETTER APOSTROPHE) followed by 0x004E (LATIN
	2523	CAPITAL LETTER N).
	2524
	2525	No adjustments are needed to entries that are references to arrays; each such
	2526	entry will have exactly one element in its range, so the offset is always 0.
	2527
	2528	The fourth (index [3]) element (C<$default>) in the list returned for this
	2529	format is 0.
	2530
	2531	=item B<C<ae>>
	2532
	2533	This is like C<"a">, but some elements are the empty string, and should not be
	2534	adjusted.
	2535	The one internal Perl property accessible by C<prop_invmap> is of this type:
	2536	"Perl_Decimal_Digit" returns an inversion map which gives the numeric values
	2537	that are represented by the Unicode decimal digit characters. Characters that
	2538	don't represent decimal digits map to the empty string, like so:
	2539
	2540	@digits @values
	2541	0x0000 ""
	2542	0x0030 0
	2543	0x003A: ""
	2544	0x0660: 0
	2545	0x066A: ""
	2546	0x06F0: 0
	2547	0x06FA: ""
	2548	0x07C0: 0
	2549	0x07CA: ""
	2550	0x0966: 0
	2551	...
	2552
	2553	This means that the code points from 0 to 0x2F do not represent decimal digits;
	2554	the code point 0x30 (DIGIT ZERO) represents 0; code point 0x31, (DIGIT ONE),
	2555	represents 0+1-0 = 1; ... code point 0x39, (DIGIT NINE), represents 0+9-0 = 9;
	2556	... code points 0x3A through 0x65F do not represent decimal digits; 0x660
	2557	(ARABIC-INDIC DIGIT ZERO), represents 0; ... 0x07C1 (NKO DIGIT ONE),
	2558	represents 0+1-0 = 1 ...
	2559
	2560	The fourth (index [3]) element (C<$default>) in the list returned for this
	2561	format is the empty string.
	2562
	2563	=item B<C<ale>>
	2564
	2565	is a combination of the C<"al"> type and the C<"ae"> type. Some of
	2566	the map array elements have the forms given by C<"al">, and
	2567	the rest are the empty string. The property C<NFKC_Casefold> has this form.
	2568	An example slice is:
	2569
	2570	@$ranges_ref @$maps_ref Note
	2571	...
	2572	0x00AA 97 FEMININE ORDINAL INDICATOR => 'a'
	2573	0x00AB 0
	2574	0x00AD SOFT HYPHEN => ""
	2575	0x00AE 0
	2576	0x00AF [ 0x0020, 0x0304 ] MACRON => SPACE . COMBINING MACRON
	2577	0x00B0 0
	2578	...
	2579
	2580	The fourth (index [3]) element (C<$default>) in the list returned for this
	2581	format is 0.
	2582
	2583	=item B<C<ar>>
	2584
	2585	means that all the elements of the map array are either rational numbers or
	2586	the string C<"NaN">, meaning "Not a Number". A rational number is either an
	2587	integer, or two integers separated by a solidus (C<"/">). The second integer
	2588	represents the denominator of the division implied by the solidus, and is
	2589	actually always positive, so it is guaranteed not to be 0 and to not be
	2590	signed. When the element is a plain integer (without the
	2591	solidus), it may need to be adjusted to get the correct value by adding the
	2592	offset, just as other C<"a"> properties. No adjustment is needed for
	2593	fractions, as the range is guaranteed to have just a single element, and so
	2594	the offset is always 0.
	2595
	2596	If you want to convert the returned map to entirely scalar numbers, you
	2597	can use something like this:
	2598
	2599	my ($invlist_ref, $invmap_ref, $format) = prop_invmap($property);
	2600	if ($format && $format eq "ar") {
	2601	map { $_ = eval $_ if $_ ne 'NaN' } @$map_ref;
	2602	}
	2603
	2604	Here's some entries from the output of the property "Nv", which has format
	2605	C<"ar">.
	2606
	2607	@numerics_ranges @numerics_maps Note
	2608	0x00 "NaN"
	2609	0x30 0 DIGIT 0 .. DIGIT 9
	2610	0x3A "NaN"
	2611	0xB2 2 SUPERSCRIPTs 2 and 3
	2612	0xB4 "NaN"
	2613	0xB9 1 SUPERSCRIPT 1
	2614	0xBA "NaN"
	2615	0xBC 1/4 VULGAR FRACTION 1/4
	2616	0xBD 1/2 VULGAR FRACTION 1/2
	2617	0xBE 3/4 VULGAR FRACTION 3/4
	2618	0xBF "NaN"
	2619	0x660 0 ARABIC-INDIC DIGIT ZERO .. NINE
	2620	0x66A "NaN"
	2621
	2622	The fourth (index [3]) element (C<$default>) in the list returned for this
	2623	format is C<"NaN">.
	2624
	2625	=item B<C<n>>
	2626
	2627	means the Name property. All the elements of the map array are simple
	2628	scalars, but some of them contain special strings that require more work to
	2629	get the actual name.
	2630
	2631	Entries such as:
	2632
	2633	CJK UNIFIED IDEOGRAPH-<code point>
	2634
	2635	mean that the name for the code point is "CJK UNIFIED IDEOGRAPH-"
	2636	with the code point (expressed in hexadecimal) appended to it, like "CJK
	2637	UNIFIED IDEOGRAPH-3403" (similarly for S<C<CJK COMPATIBILITY IDEOGRAPH-E<lt>code
	2638	pointE<gt>>>).
	2639
	2640	Also, entries like
	2641
	2642	<hangul syllable>
	2643
	2644	means that the name is algorithmically calculated. This is easily done by
	2645	the function L<charnames/charnames::viacode(code)>.
	2646
	2647	Note that for control characters (C<Gc=cc>), Unicode's data files have the
	2648	string "C<E<lt>controlE<gt>>", but the real name of each of these characters is the empty
	2649	string. This function returns that real name, the empty string. (There are
	2650	names for these characters, but they are considered aliases, not the Name
	2651	property name, and are contained in the C<Name_Alias> property.)
	2652
	2653	=item B<C<ad>>
	2654
	2655	means the Decomposition_Mapping property. This property is like C<"al">
	2656	properties, except that one of the scalar elements is of the form:
	2657
	2658	<hangul syllable>
	2659
	2660	This signifies that this entry should be replaced by the decompositions for
	2661	all the code points whose decomposition is algorithmically calculated. (All
	2662	of them are currently in one range and no others outside the range are likely
	2663	to ever be added to Unicode; the C<"n"> format
	2664	has this same entry.) These can be generated via the function
	2665	L<Unicode::Normalize::NFD()\|Unicode::Normalize>.
	2666
	2667	Note that the mapping is the one that is specified in the Unicode data files,
	2668	and to get the final decomposition, it may need to be applied recursively.
	2669
	2670	The fourth (index [3]) element (C<$default>) in the list returned for this
	2671	format is 0.
	2672
	2673	=back
	2674
	2675	Note that a format begins with the letter "a" if and only the property it is
	2676	for requires adjustments by adding the offsets in multi-element ranges. For
	2677	all these properties, an entry should be adjusted only if the map is a scalar
	2678	which is an integer. That is, it must match the regular expression:
	2679
	2680	/ ^ -? \d+ $ /xa
	2681
	2682	Further, the first element in a range never needs adjustment, as the
	2683	adjustment would be just adding 0.
	2684
	2685	A binary search such as that provided by L</search_invlist()>, can be used to
	2686	quickly find a code point in the inversion list, and hence its corresponding
	2687	mapping.
	2688
	2689	The final, fourth element (index [3], assigned to C<$default> in the "block"
	2690	example) in the four element list returned by this function is used with the
	2691	C<"a"> format types; it may also be useful for applications
	2692	that wish to convert the returned inversion map data structure into some
	2693	other, such as a hash. It gives the mapping that most code points map to
	2694	under the property. If you establish the convention that any code point not
	2695	explicitly listed in your data structure maps to this value, you can
	2696	potentially make your data structure much smaller. As you construct your data
	2697	structure from the one returned by this function, simply ignore those ranges
	2698	that map to this value. For example, to
	2699	convert to the data structure searchable by L</charinrange()>, you can follow
	2700	this recipe for properties that don't require adjustments:
	2701
	2702	my ($list_ref, $map_ref, $format, $default) = prop_invmap($property);
	2703	my @range_list;
	2704
	2705	# Look at each element in the list, but the -2 is needed because we
	2706	# look at $i+1 in the loop, and the final element is guaranteed to map
	2707	# to $default by prop_invmap(), so we would skip it anyway.
	2708	for my $i (0 .. @$list_ref - 2) {
	2709	next if $map_ref->[$i] eq $default;
	2710	push @range_list, [ $list_ref->[$i],
	2711	$list_ref->[$i+1],
	2712	$map_ref->[$i]
	2713	];
	2714	}
	2715
	2716	print charinrange(\@range_list, $code_point), "\n";
	2717
	2718	With this, C<charinrange()> will return C<undef> if its input code point maps
	2719	to C<$default>. You can avoid this by omitting the C<next> statement, and adding
	2720	a line after the loop to handle the final element of the inversion map.
	2721
	2722	Similarly, this recipe can be used for properties that do require adjustments:
	2723
	2724	for my $i (0 .. @$list_ref - 2) {
	2725	next if $map_ref->[$i] eq $default;
	2726
	2727	# prop_invmap() guarantees that if the mapping is to an array, the
	2728	# range has just one element, so no need to worry about adjustments.
	2729	if (ref $map_ref->[$i]) {
	2730	push @range_list,
	2731	[ $list_ref->[$i], $list_ref->[$i], $map_ref->[$i] ];
	2732	}
	2733	else { # Otherwise each element is actually mapped to a separate
	2734	# value, so the range has to be split into single code point
	2735	# ranges.
	2736
	2737	my $adjustment = 0;
	2738
	2739	# For each code point that gets mapped to something...
	2740	for my $j ($list_ref->[$i] .. $list_ref->[$i+1] -1 ) {
	2741
	2742	# ... add a range consisting of just it mapping to the
	2743	# original plus the adjustment, which is incremented for the
	2744	# next time through the loop, as the offset increases by 1
	2745	# for each element in the range
	2746	push @range_list,
	2747	[ $j, $j, $map_ref->[$i] + $adjustment++ ];
	2748	}
	2749	}
	2750	}
	2751
	2752	Note that the inversion maps returned for the C<Case_Folding> and
	2753	C<Simple_Case_Folding> properties do not include the Turkic-locale mappings.
	2754	Use L</casefold()> for these.
	2755
	2756	C<prop_invmap> does not know about any user-defined properties, and will
	2757	return C<undef> if called with one of those.
	2758
	2759	=cut
	2760
	2761	# User-defined properties could be handled with some changes to utf8_heavy.pl;
	2762	# if done, consideration should be given to the fact that the user subroutine
	2763	# could return different results with each call, which could lead to some
	2764	# security issues.
	2765
	2766	# One could store things in memory so they don't have to be recalculated, but
	2767	# it is unlikely this will be called often, and some properties would take up
	2768	# significant memory.
	2769
	2770	# These are created by mktables for this routine and stored in unicore/UCD.pl
	2771	# where their structures are described.
	2772	our @algorithmic_named_code_points;
	2773	our $HANGUL_BEGIN;
	2774	our $HANGUL_COUNT;
	2775
	2776	sub prop_invmap ($) {
	2777
	2778	croak __PACKAGE__, "::prop_invmap: must be called in list context" unless wantarray;
	2779
	2780	my $prop = $_[0];
	2781	return unless defined $prop;
	2782
	2783	# Fail internal properties
	2784	return if $prop =~ /^_/;
	2785
	2786	# The values returned by this function.
	2787	my (@invlist, @invmap, $format, $missing);
	2788
	2789	# The swash has two components we look at, the base list, and a hash,
	2790	# named 'SPECIALS', containing any additional members whose mappings don't
	2791	# fit into the base list scheme of things. These generally 'override'
	2792	# any value in the base list for the same code point.
	2793	my $overrides;
	2794
	2795	require "utf8_heavy.pl";
	2796	require "unicore/UCD.pl";
	2797
	2798	RETRY:
	2799
	2800	# If there are multiple entries for a single code point
	2801	my $has_multiples = 0;
	2802
	2803	# Try to get the map swash for the property. They have 'To' prepended to
	2804	# the property name, and 32 means we will accept 32 bit return values.
	2805	# The 0 means we aren't calling this from tr///.
	2806	my $swash = utf8::SWASHNEW(__PACKAGE__, "To$prop", undef, 32, 0);
	2807
	2808	# If didn't find it, could be because needs a proxy. And if was the
	2809	# 'Block' or 'Name' property, use a proxy even if did find it. Finding it
	2810	# in these cases would be the result of the installation changing mktables
	2811	# to output the Block or Name tables. The Block table gives block names
	2812	# in the new-style, and this routine is supposed to return old-style block
	2813	# names. The Name table is valid, but we need to execute the special code
	2814	# below to add in the algorithmic-defined name entries.
	2815	# And NFKCCF needs conversion, so handle that here too.
	2816	if (ref $swash eq ""
	2817	\|\| $swash->{'TYPE'} =~ / ^ To (?: Blk \| Na \| NFKCCF ) $ /x)
	2818	{
	2819
	2820	# Get the short name of the input property, in standard form
	2821	my ($second_try) = prop_aliases($prop);
	2822	return unless $second_try;
	2823	$second_try = utf8::_loose_name(lc $second_try);
	2824
	2825	if ($second_try eq "in") {
	2826
	2827	# This property is identical to age for inversion map purposes
	2828	$prop = "age";
	2829	goto RETRY;
	2830	}
	2831	elsif ($second_try =~ / ^ s ( cf \| fc \| [ltu] c ) $ /x) {
	2832
	2833	# These properties use just the LIST part of the full mapping,
	2834	# which includes the simple maps that are otherwise overridden by
	2835	# the SPECIALS. So all we need do is to not look at the SPECIALS;
	2836	# set $overrides to indicate that
	2837	$overrides = -1;
	2838
	2839	# The full name is the simple name stripped of its initial 's'
	2840	$prop = $1;
	2841
	2842	# .. except for this case
	2843	$prop = 'cf' if $prop eq 'fc';
	2844
	2845	goto RETRY;
	2846	}
	2847	elsif ($second_try eq "blk") {
	2848
	2849	# We use the old block names. Just create a fake swash from its
	2850	# data.
	2851	_charblocks();
	2852	my %blocks;
	2853	$blocks{'LIST'} = "";
	2854	$blocks{'TYPE'} = "ToBlk";
	2855	$utf8::SwashInfo{ToBlk}{'missing'} = "No_Block";
	2856	$utf8::SwashInfo{ToBlk}{'format'} = "s";
	2857
	2858	foreach my $block (@BLOCKS) {
	2859	$blocks{'LIST'} .= sprintf "%x\t%x\t%s\n",
	2860	$block->[0],
	2861	$block->[1],
	2862	$block->[2];
	2863	}
	2864	$swash = \%blocks;
	2865	}
	2866	elsif ($second_try eq "na") {
	2867
	2868	# Use the combo file that has all the Name-type properties in it,
	2869	# extracting just the ones that are for the actual 'Name'
	2870	# property. And create a fake swash from it.
	2871	my %names;
	2872	$names{'LIST'} = "";
	2873	my $original = do "unicore/Name.pl";
	2874	my $algorithm_names = \@algorithmic_named_code_points;
	2875
	2876	# We need to remove the names from it that are aliases. For that
	2877	# we need to also read in that table. Create a hash with the keys
	2878	# being the code points, and the values being a list of the
	2879	# aliases for the code point key.
	2880	my ($aliases_code_points, $aliases_maps, undef, undef) =
	2881	&prop_invmap('Name_Alias');
	2882	my %aliases;
	2883	for (my $i = 0; $i < @$aliases_code_points; $i++) {
	2884	my $code_point = $aliases_code_points->[$i];
	2885	$aliases{$code_point} = $aliases_maps->[$i];
	2886
	2887	# If not already a list, make it into one, so that later we
	2888	# can treat things uniformly
	2889	if (! ref $aliases{$code_point}) {
	2890	$aliases{$code_point} = [ $aliases{$code_point} ];
	2891	}
	2892
	2893	# Remove the alias type from the entry, retaining just the
	2894	# name.
	2895	map { s/:.*// } @{$aliases{$code_point}};
	2896	}
	2897
	2898	my $i = 0;
	2899	foreach my $line (split "\n", $original) {
	2900	my ($hex_code_point, $name) = split "\t", $line;
	2901
	2902	# Weeds out all comments, blank lines, and named sequences
	2903	next if $hex_code_point =~ /[^[:xdigit:]]/a;
	2904
	2905	my $code_point = hex $hex_code_point;
	2906
	2907	# The name of all controls is the default: the empty string.
	2908	# The set of controls is immutable, so these hard-coded
	2909	# constants work.
	2910	next if $code_point <= 0x9F
	2911	&& ($code_point <= 0x1F \|\| $code_point >= 0x7F);
	2912
	2913	# If this is a name_alias, it isn't a name
	2914	next if grep { $_ eq $name } @{$aliases{$code_point}};
	2915
	2916	# If we are beyond where one of the special lines needs to
	2917	# be inserted ...
	2918	while ($i < @$algorithm_names
	2919	&& $code_point > $algorithm_names->[$i]->{'low'})
	2920	{
	2921
	2922	# ... then insert it, ahead of what we were about to
	2923	# output
	2924	$names{'LIST'} .= sprintf "%x\t%x\t%s\n",
	2925	$algorithm_names->[$i]->{'low'},
	2926	$algorithm_names->[$i]->{'high'},
	2927	$algorithm_names->[$i]->{'name'};
	2928
	2929	# Done with this range.
	2930	$i++;
	2931
	2932	# We loop until all special lines that precede the next
	2933	# regular one are output.
	2934	}
	2935
	2936	# Here, is a normal name.
	2937	$names{'LIST'} .= sprintf "%x\t\t%s\n", $code_point, $name;
	2938	} # End of loop through all the names
	2939
	2940	$names{'TYPE'} = "ToNa";
	2941	$utf8::SwashInfo{ToNa}{'missing'} = "";
	2942	$utf8::SwashInfo{ToNa}{'format'} = "n";
	2943	$swash = \%names;
	2944	}
	2945	elsif ($second_try =~ / ^ ( d [mt] ) $ /x) {
	2946
	2947	# The file is a combination of dt and dm properties. Create a
	2948	# fake swash from the portion that we want.
	2949	my $original = do "unicore/Decomposition.pl";
	2950	my %decomps;
	2951
	2952	if ($second_try eq 'dt') {
	2953	$decomps{'TYPE'} = "ToDt";
	2954	$utf8::SwashInfo{'ToDt'}{'missing'} = "None";
	2955	$utf8::SwashInfo{'ToDt'}{'format'} = "s";
	2956	} # 'dm' is handled below, with 'nfkccf'
	2957
	2958	$decomps{'LIST'} = "";
	2959
	2960	# This property has one special range not in the file: for the
	2961	# hangul syllables. But not in Unicode version 1.
	2962	UnicodeVersion() unless defined $v_unicode_version;
	2963	my $done_hangul = ($v_unicode_version lt v2.0.0)
	2964	? 1
	2965	: 0; # Have we done the hangul range ?
	2966	foreach my $line (split "\n", $original) {
	2967	my ($hex_lower, $hex_upper, $type_and_map) = split "\t", $line;
	2968	my $code_point = hex $hex_lower;
	2969	my $value;
	2970	my $redo = 0;
	2971
	2972	# The type, enclosed in <...>, precedes the mapping separated
	2973	# by blanks
	2974	if ($type_and_map =~ / ^ < ( .* ) > \s+ (.*) $ /x) {
	2975	$value = ($second_try eq 'dt') ? $1 : $2
	2976	}
	2977	else { # If there is no type specified, it's canonical
	2978	$value = ($second_try eq 'dt')
	2979	? "Canonical" :
	2980	$type_and_map;
	2981	}
	2982
	2983	# Insert the hangul range at the appropriate spot.
	2984	if (! $done_hangul && $code_point > $HANGUL_BEGIN) {
	2985	$done_hangul = 1;
	2986	$decomps{'LIST'} .=
	2987	sprintf "%x\t%x\t%s\n",
	2988	$HANGUL_BEGIN,
	2989	$HANGUL_BEGIN + $HANGUL_COUNT - 1,
	2990	($second_try eq 'dt')
	2991	? "Canonical"
	2992	: "<hangul syllable>";
	2993	}
	2994
	2995	if ($value =~ / / && $hex_upper ne "" && $hex_upper ne $hex_lower) {
	2996	$line = sprintf("%04X\t%s\t%s", hex($hex_lower) + 1, $hex_upper, $value);
	2997	$hex_upper = "";
	2998	$redo = 1;
	2999	}
	3000
	3001	# And append this to our constructed LIST.
	3002	$decomps{'LIST'} .= "$hex_lower\t$hex_upper\t$value\n";
	3003
	3004	redo if $redo;
	3005	}
	3006	$swash = \%decomps;
	3007	}
	3008	elsif ($second_try ne 'nfkccf') { # Don't know this property. Fail.
	3009	return;
	3010	}
	3011
	3012	if ($second_try eq 'nfkccf' \|\| $second_try eq 'dm') {
	3013
	3014	# The 'nfkccf' property is stored in the old format for backwards
	3015	# compatibility for any applications that has read its file
	3016	# directly before prop_invmap() existed.
	3017	# And the code above has extracted the 'dm' property from its file
	3018	# yielding the same format. So here we convert them to adjusted
	3019	# format for compatibility with the other properties similar to
	3020	# them.
	3021	my %revised_swash;
	3022
	3023	# We construct a new converted list.
	3024	my $list = "";
	3025
	3026	my @ranges = split "\n", $swash->{'LIST'};
	3027	for (my $i = 0; $i < @ranges; $i++) {
	3028	my ($hex_begin, $hex_end, $map) = split "\t", $ranges[$i];
	3029
	3030	# The dm property has maps that are space separated sequences
	3031	# of code points, as well as the special entry "<hangul
	3032	# syllable>, which also contains a blank.
	3033	my @map = split " ", $map;
	3034	if (@map > 1) {
	3035
	3036	# If it's just the special entry, append as-is.
	3037	if ($map eq '<hangul syllable>') {
	3038	$list .= "$ranges[$i]\n";
	3039	}
	3040	else {
	3041
	3042	# These should all be single-element ranges.
	3043	croak __PACKAGE__, "::prop_invmap: Not expecting a mapping with multiple code points in a multi-element range, $ranges[$i]" if $hex_end ne "" && $hex_end ne $hex_begin;
	3044
	3045	# Convert them to decimal, as that's what's expected.
	3046	$list .= "$hex_begin\t\t"
	3047	. join(" ", map { hex } @map)
	3048	. "\n";
	3049	}
	3050	next;
	3051	}
	3052
	3053	# Here, the mapping doesn't have a blank, is for a single code
	3054	# point.
	3055	my $begin = hex $hex_begin;
	3056	my $end = (defined $hex_end && $hex_end ne "")
	3057	? hex $hex_end
	3058	: $begin;
	3059
	3060	# Again, the output is to be in decimal.
	3061	my $decimal_map = hex $map;
	3062
	3063	# We know that multi-element ranges with the same mapping
	3064	# should not be adjusted, as after the adjustment
	3065	# multi-element ranges are for consecutive increasing code
	3066	# points. Further, the final element in the list won't be
	3067	# adjusted, as there is nothing after it to include in the
	3068	# adjustment
	3069	if ($begin != $end \|\| $i == @ranges -1) {
	3070
	3071	# So just convert these to single-element ranges
	3072	foreach my $code_point ($begin .. $end) {
	3073	$list .= sprintf("%04X\t\t%d\n",
	3074	$code_point, $decimal_map);
	3075	}
	3076	}
	3077	else {
	3078
	3079	# Here, we have a candidate for adjusting. What we do is
	3080	# look through the subsequent adjacent elements in the
	3081	# input. If the map to the next one differs by 1 from the
	3082	# one before, then we combine into a larger range with the
	3083	# initial map. Loop doing this until we find one that
	3084	# can't be combined.
	3085
	3086	my $offset = 0; # How far away are we from the initial
	3087	# map
	3088	my $squished = 0; # ? Did we squish at least two
	3089	# elements together into one range
	3090	for ( ; $i < @ranges; $i++) {
	3091	my ($next_hex_begin, $next_hex_end, $next_map)
	3092	= split "\t", $ranges[$i+1];
	3093
	3094	# In the case of 'dm', the map may be a sequence of
	3095	# multiple code points, which are never combined with
	3096	# another range
	3097	last if $next_map =~ / /;
	3098
	3099	$offset++;
	3100	my $next_decimal_map = hex $next_map;
	3101
	3102	# If the next map is not next in sequence, it
	3103	# shouldn't be combined.
	3104	last if $next_decimal_map != $decimal_map + $offset;
	3105
	3106	my $next_begin = hex $next_hex_begin;
	3107
	3108	# Likewise, if the next element isn't adjacent to the
	3109	# previous one, it shouldn't be combined.
	3110	last if $next_begin != $begin + $offset;
	3111
	3112	my $next_end = (defined $next_hex_end
	3113	&& $next_hex_end ne "")
	3114	? hex $next_hex_end
	3115	: $next_begin;
	3116
	3117	# And finally, if the next element is a multi-element
	3118	# range, it shouldn't be combined.
	3119	last if $next_end != $next_begin;
	3120
	3121	# Here, we will combine. Loop to see if we should
	3122	# combine the next element too.
	3123	$squished = 1;
	3124	}
	3125
	3126	if ($squished) {
	3127
	3128	# Here, 'i' is the element number of the last element to
	3129	# be combined, and the range is single-element, or we
	3130	# wouldn't be combining. Get it's code point.
	3131	my ($hex_end, undef, undef) = split "\t", $ranges[$i];
	3132	$list .= "$hex_begin\t$hex_end\t$decimal_map\n";
	3133	} else {
	3134
	3135	# Here, no combining done. Just append the initial
	3136	# (and current) values.
	3137	$list .= "$hex_begin\t\t$decimal_map\n";
	3138	}
	3139	}
	3140	} # End of loop constructing the converted list
	3141
	3142	# Finish up the data structure for our converted swash
	3143	my $type = ($second_try eq 'nfkccf') ? 'ToNFKCCF' : 'ToDm';
	3144	$revised_swash{'LIST'} = $list;
	3145	$revised_swash{'TYPE'} = $type;
	3146	$revised_swash{'SPECIALS'} = $swash->{'SPECIALS'};
	3147	$swash = \%revised_swash;
	3148
	3149	$utf8::SwashInfo{$type}{'missing'} = 0;
	3150	$utf8::SwashInfo{$type}{'format'} = 'a';
	3151	}
	3152	}
	3153
	3154	if ($swash->{'EXTRAS'}) {
	3155	carp __PACKAGE__, "::prop_invmap: swash returned for $prop unexpectedly has EXTRAS magic";
	3156	return;
	3157	}
	3158
	3159	# Here, have a valid swash return. Examine it.
	3160	my $returned_prop = $swash->{'TYPE'};
	3161
	3162	# All properties but binary ones should have 'missing' and 'format'
	3163	# entries
	3164	$missing = $utf8::SwashInfo{$returned_prop}{'missing'};
	3165	$missing = 'N' unless defined $missing;
	3166
	3167	$format = $utf8::SwashInfo{$returned_prop}{'format'};
	3168	$format = 'b' unless defined $format;
	3169
	3170	my $requires_adjustment = $format =~ /^a/;
	3171
	3172	# The LIST input lines look like:
	3173	# ...
	3174	# 0374\t\tCommon
	3175	# 0375\t0377\tGreek # [3]
	3176	# 037A\t037D\tGreek # [4]
	3177	# 037E\t\tCommon
	3178	# 0384\t\tGreek
	3179	# ...
	3180	#
	3181	# Convert them to like
	3182	# 0374 => Common
	3183	# 0375 => Greek
	3184	# 0378 => $missing
	3185	# 037A => Greek
	3186	# 037E => Common
	3187	# 037F => $missing
	3188	# 0384 => Greek
	3189	#
	3190	# For binary properties, the final non-comment column is absent, and
	3191	# assumed to be 'Y'.
	3192
	3193	foreach my $range (split "\n", $swash->{'LIST'}) {
	3194	$range =~ s/ \s* (?: \# .* )? $ //xg; # rmv trailing space, comments
	3195
	3196	# Find the beginning and end of the range on the line
	3197	my ($hex_begin, $hex_end, $map) = split "\t", $range;
	3198	my $begin = hex $hex_begin;
	3199	my $end = (defined $hex_end && $hex_end ne "")
	3200	? hex $hex_end
	3201	: $begin;
	3202
	3203	# Each time through the loop (after the first):
	3204	# $invlist[-2] contains the beginning of the previous range processed
	3205	# $invlist[-1] contains the end+1 of the previous range processed
	3206	# $invmap[-2] contains the value of the previous range processed
	3207	# $invmap[-1] contains the default value for missing ranges ($missing)
	3208	#
	3209	# Thus, things are set up for the typical case of a new non-adjacent
	3210	# range of non-missings to be added. But, if the new range is
	3211	# adjacent, it needs to replace the [-1] element; and if the new
	3212	# range is a multiple value of the previous one, it needs to be added
	3213	# to the [-2] map element.
	3214
	3215	# The first time through, everything will be empty. If the property
	3216	# doesn't have a range that begins at 0, add one that maps to $missing
	3217	if (! @invlist) {
	3218	if ($begin != 0) {
	3219	push @invlist, 0;
	3220	push @invmap, $missing;
	3221	}
	3222	}
	3223	elsif (@invlist > 1 && $invlist[-2] == $begin) {
	3224
	3225	# Here we handle the case where the input has multiple entries for
	3226	# each code point. mktables should have made sure that each such
	3227	# range contains only one code point. At this point, $invlist[-1]
	3228	# is the $missing that was added at the end of the last loop
	3229	# iteration, and [-2] is the last real input code point, and that
	3230	# code point is the same as the one we are adding now, making the
	3231	# new one a multiple entry. Add it to the existing entry, either
	3232	# by pushing it to the existing list of multiple entries, or
	3233	# converting the single current entry into a list with both on it.
	3234	# This is all we need do for this iteration.
	3235
	3236	if ($end != $begin) {
	3237	croak __PACKAGE__, ":prop_invmap: Multiple maps per code point in '$prop' require single-element ranges: begin=$begin, end=$end, map=$map";
	3238	}
	3239	if (! ref $invmap[-2]) {
	3240	$invmap[-2] = [ $invmap[-2], $map ];
	3241	}
	3242	else {
	3243	push @{$invmap[-2]}, $map;
	3244	}
	3245	$has_multiples = 1;
	3246	next;
	3247	}
	3248	elsif ($invlist[-1] == $begin) {
	3249
	3250	# If the input isn't in the most compact form, so that there are
	3251	# two adjacent ranges that map to the same thing, they should be
	3252	# combined (EXCEPT where the arrays require adjustments, in which
	3253	# case everything is already set up correctly). This happens in
	3254	# our constructed dt mapping, as Element [-2] is the map for the
	3255	# latest range so far processed. Just set the beginning point of
	3256	# the map to $missing (in invlist[-1]) to 1 beyond where this
	3257	# range ends. For example, in
	3258	# 12\t13\tXYZ
	3259	# 14\t17\tXYZ
	3260	# we have set it up so that it looks like
	3261	# 12 => XYZ
	3262	# 14 => $missing
	3263	#
	3264	# We now see that it should be
	3265	# 12 => XYZ
	3266	# 18 => $missing
	3267	if (! $requires_adjustment && @invlist > 1 && ( (defined $map)
	3268	? $invmap[-2] eq $map
	3269	: $invmap[-2] eq 'Y'))
	3270	{
	3271	$invlist[-1] = $end + 1;
	3272	next;
	3273	}
	3274
	3275	# Here, the range started in the previous iteration that maps to
	3276	# $missing starts at the same code point as this range. That
	3277	# means there is no gap to fill that that range was intended for,
	3278	# so we just pop it off the parallel arrays.
	3279	pop @invlist;
	3280	pop @invmap;
	3281	}
	3282
	3283	# Add the range beginning, and the range's map.
	3284	push @invlist, $begin;
	3285	if ($returned_prop eq 'ToDm') {
	3286
	3287	# The decomposition maps are either a line like <hangul syllable>
	3288	# which are to be taken as is; or a sequence of code points in hex
	3289	# and separated by blanks. Convert them to decimal, and if there
	3290	# is more than one, use an anonymous array as the map.
	3291	if ($map =~ /^ < /x) {
	3292	push @invmap, $map;
	3293	}
	3294	else {
	3295	my @map = split " ", $map;
	3296	if (@map == 1) {
	3297	push @invmap, $map[0];
	3298	}
	3299	else {
	3300	push @invmap, \@map;
	3301	}
	3302	}
	3303	}
	3304	else {
	3305
	3306	# Otherwise, convert hex formatted list entries to decimal; add a
	3307	# 'Y' map for the missing value in binary properties, or
	3308	# otherwise, use the input map unchanged.
	3309	$map = ($format eq 'x')
	3310	? hex $map
	3311	: $format eq 'b'
	3312	? 'Y'
	3313	: $map;
	3314	push @invmap, $map;
	3315	}
	3316
	3317	# We just started a range. It ends with $end. The gap between it and
	3318	# the next element in the list must be filled with a range that maps
	3319	# to the default value. If there is no gap, the next iteration will
	3320	# pop this, unless there is no next iteration, and we have filled all
	3321	# of the Unicode code space, so check for that and skip.
	3322	if ($end < $MAX_UNICODE_CODEPOINT) {
	3323	push @invlist, $end + 1;
	3324	push @invmap, $missing;
	3325	}
	3326	}
	3327
	3328	# If the property is empty, make all code points use the value for missing
	3329	# ones.
	3330	if (! @invlist) {
	3331	push @invlist, 0;
	3332	push @invmap, $missing;
	3333	}
	3334
	3335	# And add in standard element that all non-Unicode code points map to:
	3336	# $missing
	3337	push @invlist, $MAX_UNICODE_CODEPOINT + 1;
	3338	push @invmap, $missing;
	3339
	3340	# The second component of the map are those values that require
	3341	# non-standard specification, stored in SPECIALS. These override any
	3342	# duplicate code points in LIST. If we are using a proxy, we may have
	3343	# already set $overrides based on the proxy.
	3344	$overrides = $swash->{'SPECIALS'} unless defined $overrides;
	3345	if ($overrides) {
	3346
	3347	# A negative $overrides implies that the SPECIALS should be ignored,
	3348	# and a simple 'a' list is the value.
	3349	if ($overrides < 0) {
	3350	$format = 'a';
	3351	}
	3352	else {
	3353
	3354	# Currently, all overrides are for properties that normally map to
	3355	# single code points, but now some will map to lists of code
	3356	# points (but there is an exception case handled below).
	3357	$format = 'al';
	3358
	3359	# Look through the overrides.
	3360	foreach my $cp_maybe_utf8 (keys %$overrides) {
	3361	my $cp;
	3362	my @map;
	3363
	3364	# If the overrides came from SPECIALS, the code point keys are
	3365	# packed UTF-8.
	3366	if ($overrides == $swash->{'SPECIALS'}) {
	3367	$cp = unpack("C0U", $cp_maybe_utf8);
	3368	@map = unpack "U0U*", $swash->{'SPECIALS'}{$cp_maybe_utf8};
	3369
	3370	# The empty string will show up unpacked as an empty
	3371	# array.
	3372	$format = 'ale' if @map == 0;
	3373	}
	3374	else {
	3375
	3376	# But if we generated the overrides, we didn't bother to
	3377	# pack them, and we, so far, do this only for properties
	3378	# that are 'a' ones.
	3379	$cp = $cp_maybe_utf8;
	3380	@map = hex $overrides->{$cp};
	3381	$format = 'a';
	3382	}
	3383
	3384	# Find the range that the override applies to.
	3385	my $i = search_invlist(\@invlist, $cp);
	3386	if ($cp < $invlist[$i] \|\| $cp >= $invlist[$i + 1]) {
	3387	croak __PACKAGE__, "::prop_invmap: wrong_range, cp=$cp; i=$i, current=$invlist[$i]; next=$invlist[$i + 1]"
	3388	}
	3389
	3390	# And what that range currently maps to
	3391	my $cur_map = $invmap[$i];
	3392
	3393	# If there is a gap between the next range and the code point
	3394	# we are overriding, we have to add elements to both arrays to
	3395	# fill that gap, using the map that applies to it, which is
	3396	# $cur_map, since it is part of the current range.
	3397	if ($invlist[$i + 1] > $cp + 1) {
	3398	#use feature 'say';
	3399	#say "Before splice:";
	3400	#say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
	3401	#say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
	3402	#say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
	3403	#say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
	3404	#say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
	3405
	3406	splice @invlist, $i + 1, 0, $cp + 1;
	3407	splice @invmap, $i + 1, 0, $cur_map;
	3408
	3409	#say "After splice:";
	3410	#say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
	3411	#say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
	3412	#say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
	3413	#say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
	3414	#say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
	3415	}
	3416
	3417	# If the remaining portion of the range is multiple code
	3418	# points (ending with the one we are replacing, guaranteed by
	3419	# the earlier splice). We must split it into two
	3420	if ($invlist[$i] < $cp) {
	3421	$i++; # Compensate for the new element
	3422
	3423	#use feature 'say';
	3424	#say "Before splice:";
	3425	#say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
	3426	#say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
	3427	#say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
	3428	#say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
	3429	#say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
	3430
	3431	splice @invlist, $i, 0, $cp;
	3432	splice @invmap, $i, 0, 'dummy';
	3433
	3434	#say "After splice:";
	3435	#say 'i-2=[', $i-2, ']', sprintf("%04X maps to %s", $invlist[$i-2], $invmap[$i-2]) if $i >= 2;
	3436	#say 'i-1=[', $i-1, ']', sprintf("%04X maps to %s", $invlist[$i-1], $invmap[$i-1]) if $i >= 1;
	3437	#say 'i =[', $i, ']', sprintf("%04X maps to %s", $invlist[$i], $invmap[$i]);
	3438	#say 'i+1=[', $i+1, ']', sprintf("%04X maps to %s", $invlist[$i+1], $invmap[$i+1]) if $i < @invlist + 1;
	3439	#say 'i+2=[', $i+2, ']', sprintf("%04X maps to %s", $invlist[$i+2], $invmap[$i+2]) if $i < @invlist + 2;
	3440	}
	3441
	3442	# Here, the range we are overriding contains a single code
	3443	# point. The result could be the empty string, a single
	3444	# value, or a list. If the last case, we use an anonymous
	3445	# array.
	3446	$invmap[$i] = (scalar @map == 0)
	3447	? ""
	3448	: (scalar @map > 1)
	3449	? \@map
	3450	: $map[0];
	3451	}
	3452	}
	3453	}
	3454	elsif ($format eq 'x') {
	3455
	3456	# All hex-valued properties are really to code points, and have been
	3457	# converted to decimal.
	3458	$format = 's';
	3459	}
	3460	elsif ($returned_prop eq 'ToDm') {
	3461	$format = 'ad';
	3462	}
	3463	elsif ($format eq 'sw') { # blank-separated elements to form a list.
	3464	map { $_ = [ split " ", $_ ] if $_ =~ / / } @invmap;
	3465	$format = 'sl';
	3466	}
	3467	elsif ($returned_prop eq 'ToNameAlias') {
	3468
	3469	# This property currently doesn't have any lists, but theoretically
	3470	# could
	3471	$format = 'sl';
	3472	}
	3473	elsif ($returned_prop eq 'ToPerlDecimalDigit') {
	3474	$format = 'ae';
	3475	}
	3476	elsif ($returned_prop eq 'ToNv') {
	3477
	3478	# The one property that has this format is stored as a delta, so needs
	3479	# to indicate that need to add code point to it.
	3480	$format = 'ar';
	3481	}
	3482	elsif ($format ne 'n' && $format ne 'a') {
	3483
	3484	# All others are simple scalars
	3485	$format = 's';
	3486	}
	3487	if ($has_multiples && $format !~ /l/) {
	3488	croak __PACKAGE__, "::prop_invmap: Wrong format '$format' for prop_invmap('$prop'); should indicate has lists";
	3489	}
	3490
	3491	return (\@invlist, \@invmap, $format, $missing);
	3492	}
	3493
	3494	sub search_invlist {
	3495
	3496	=pod
	3497
	3498	=head2 B<search_invlist()>
	3499
	3500	use Unicode::UCD qw(prop_invmap prop_invlist);
	3501	use Unicode::UCD 'search_invlist';
	3502
	3503	my @invlist = prop_invlist($property_name);
	3504	print $code_point, ((search_invlist(\@invlist, $code_point) // -1) % 2)
	3505	? " isn't"
	3506	: " is",
	3507	" in $property_name\n";
	3508
	3509	my ($blocks_ranges_ref, $blocks_map_ref) = prop_invmap("Block");
	3510	my $index = search_invlist($blocks_ranges_ref, $code_point);
	3511	print "$code_point is in block ", $blocks_map_ref->[$index], "\n";
	3512
	3513	C<search_invlist> is used to search an inversion list returned by
	3514	C<prop_invlist> or C<prop_invmap> for a particular L</code point argument>.
	3515	C<undef> is returned if the code point is not found in the inversion list
	3516	(this happens only when it is not a legal L<code point argument>, or is less
	3517	than the list's first element). A warning is raised in the first instance.
	3518
	3519	Otherwise, it returns the index into the list of the range that contains the
	3520	code point.; that is, find C<i> such that
	3521
	3522	list[i]<= code_point < list[i+1].
	3523
	3524	As explained in L</prop_invlist()>, whether a code point is in the list or not
	3525	depends on if the index is even (in) or odd (not in). And as explained in
	3526	L</prop_invmap()>, the index is used with the returned parallel array to find
	3527	the mapping.
	3528
	3529	=cut
	3530
	3531
	3532	my $list_ref = shift;
	3533	my $input_code_point = shift;
	3534	my $code_point = _getcode($input_code_point);
	3535
	3536	if (! defined $code_point) {
	3537	carp __PACKAGE__, "::search_invlist: unknown code '$input_code_point'";
	3538	return;
	3539	}
	3540
	3541	my $max_element = @$list_ref - 1;
	3542
	3543	# Return undef if list is empty or requested item is before the first element.
	3544	return if $max_element < 0;
	3545	return if $code_point < $list_ref->[0];
	3546
	3547	# Short cut something at the far-end of the table. This also allows us to
	3548	# refer to element [$i+1] without fear of being out-of-bounds in the loop
	3549	# below.
	3550	return $max_element if $code_point >= $list_ref->[$max_element];
	3551
	3552	use integer; # want integer division
	3553
	3554	my $i = $max_element / 2;
	3555
	3556	my $lower = 0;
	3557	my $upper = $max_element;
	3558	while (1) {
	3559
	3560	if ($code_point >= $list_ref->[$i]) {
	3561
	3562	# Here we have met the lower constraint. We can quit if we
	3563	# also meet the upper one.
	3564	last if $code_point < $list_ref->[$i+1];
	3565
	3566	$lower = $i; # Still too low.
	3567
	3568	}
	3569	else {
	3570
	3571	# Here, $code_point < $list_ref[$i], so look lower down.
	3572	$upper = $i;
	3573	}
	3574
	3575	# Split search domain in half to try again.
	3576	my $temp = ($upper + $lower) / 2;
	3577
	3578	# No point in continuing unless $i changes for next time
	3579	# in the loop.
	3580	return $i if $temp == $i;
	3581	$i = $temp;
	3582	} # End of while loop
	3583
	3584	# Here we have found the offset
	3585	return $i;
	3586	}
	3587
	3588	=head2 Unicode::UCD::UnicodeVersion
	3589
	3590	This returns the version of the Unicode Character Database, in other words, the
	3591	version of the Unicode standard the database implements. The version is a
	3592	string of numbers delimited by dots (C<'.'>).
	3593
	3594	=cut
	3595
	3596	my $UNICODEVERSION;
	3597
	3598	sub UnicodeVersion {
	3599	unless (defined $UNICODEVERSION) {
	3600	openunicode(\$VERSIONFH, "version");
	3601	local $/ = "\n";
	3602	chomp($UNICODEVERSION = <$VERSIONFH>);
	3603	close($VERSIONFH);
	3604	croak __PACKAGE__, "::VERSION: strange version '$UNICODEVERSION'"
	3605	unless $UNICODEVERSION =~ /^\d+(?:\.\d+)+$/;
	3606	}
	3607	$v_unicode_version = pack "C*", split /\./, $UNICODEVERSION;
	3608	return $UNICODEVERSION;
	3609	}
	3610
	3611	=head2 B<Blocks versus Scripts>
	3612
	3613	The difference between a block and a script is that scripts are closer
	3614	to the linguistic notion of a set of code points required to present
	3615	languages, while block is more of an artifact of the Unicode code point
	3616	numbering and separation into blocks of consecutive code points (so far the
	3617	size of a block is some multiple of 16, like 128 or 256).
	3618
	3619	For example the Latin B<script> is spread over several B<blocks>, such
	3620	as C<Basic Latin>, C<Latin 1 Supplement>, C<Latin Extended-A>, and
	3621	C<Latin Extended-B>. On the other hand, the Latin script does not
	3622	contain all the characters of the C<Basic Latin> block (also known as
	3623	ASCII): it includes only the letters, and not, for example, the digits
	3624	or the punctuation.
	3625
	3626	For blocks see L<http://www.unicode.org/Public/UNIDATA/Blocks.txt>
	3627
	3628	For scripts see UTR #24: L<http://www.unicode.org/unicode/reports/tr24/>
	3629
	3630	=head2 B<Matching Scripts and Blocks>
	3631
	3632	Scripts are matched with the regular-expression construct
	3633	C<\p{...}> (e.g. C<\p{Tibetan}> matches characters of the Tibetan script),
	3634	while C<\p{Blk=...}> is used for blocks (e.g. C<\p{Blk=Tibetan}> matches
	3635	any of the 256 code points in the Tibetan block).
	3636
	3637	=head2 Old-style versus new-style block names
	3638
	3639	Unicode publishes the names of blocks in two different styles, though the two
	3640	are equivalent under Unicode's loose matching rules.
	3641
	3642	The original style uses blanks and hyphens in the block names (except for
	3643	C<No_Block>), like so:
	3644
	3645	Miscellaneous Mathematical Symbols-B
	3646
	3647	The newer style replaces these with underscores, like this:
	3648
	3649	Miscellaneous_Mathematical_Symbols_B
	3650
	3651	This newer style is consistent with the values of other Unicode properties.
	3652	To preserve backward compatibility, all the functions in Unicode::UCD that
	3653	return block names (except one) return the old-style ones. That one function,
	3654	L</prop_value_aliases()> can be used to convert from old-style to new-style:
	3655
	3656	my $new_style = prop_values_aliases("block", $old_style);
	3657
	3658	Perl also has single-form extensions that refer to blocks, C<In_Cyrillic>,
	3659	meaning C<Block=Cyrillic>. These have always been written in the new style.
	3660
	3661	To convert from new-style to old-style, follow this recipe:
	3662
	3663	$old_style = charblock((prop_invlist("block=$new_style"))[0]);
	3664
	3665	(which finds the range of code points in the block using C<prop_invlist>,
	3666	gets the lower end of the range (0th element) and then looks up the old name
	3667	for its block using C<charblock>).
	3668
	3669	Note that starting in Unicode 6.1, many of the block names have shorter
	3670	synonyms. These are always given in the new style.
	3671
	3672	=head1 BUGS
	3673
	3674	Does not yet support EBCDIC platforms.
	3675
	3676	=head1 AUTHOR
	3677
	3678	Jarkko Hietaniemi. Now maintained by perl5 porters.
	3679
	3680	=cut
	3681
	3682	1;