perl5.git.perl.org Git - perl5.git/blame

Commit	Line	Data
423cee85	1	package charnames;
b177ca84 JF	2	use strict;
b177ca84 JF	3	use warnings;
280e9a5c	4	our $VERSION = '1.50';
a03f0b9f	5	use unicore::Name; # mktables-generated algorithmically-defined names
e7a078a0	6	use _charnames (); # The submodule for this where most of the work gets done
b75c8c73	7
52fb7278	8	use bytes (); # for $bytes::hint_bits
123148a1	9	use re "/aa"; # Everything in here should be ASCII
423cee85	10
38f4139d	11	# Translate between Unicode character names and their code points.
e7a078a0 KW	12	# This is a wrapper around the submodule C<_charnames>. This design allows
	13	# C<_charnames> to be autoloaded to enable use of \N{...}, but requires this
	14	# module to be explicitly requested for the functions API.
b177ca84	15
889a6fe0	16	$Carp::Internal{ (__PACKAGE__) } = 1;
63098191	17
b177ca84 JF	18	sub import # called by "use charnames LIST"; the real work is delegated
	19	{
	20	shift; ## ignore class name
e7a078a0 KW	21	_charnames->import(@_); # forward the remaining arguments to the _charnames submodule
e7a078a0 KW	22	}
423cee85	23
84374e30 KW	24	# Cache of already looked-up values. This is set to only contain
	25	# official values, and user aliases can't override them, so scoping is
	26	# not an issue.
	27	my %viacode;
63098191 KW	28
63098191 KW	29	sub viacode { # public wrapper: arguments and result pass straight through
e7a078a0 KW	30	return _charnames::viacode(@_); # the lookup itself lives in the _charnames submodule
e7a078a0 KW	31	}
daf0d493 JH	32
	33	sub vianame
	34	{
35c0985d	35	if (@_ != 1) {
e7a078a0	36	_charnames::carp "charnames::vianame() expects one name argument";
35c0985d MB	37	return ()
35c0985d MB	38	}
daf0d493	39
63098191 KW	40	# Looks up the character name and returns its ordinal if
63098191 KW	41	# found, undef otherwise.
daf0d493	42
63098191	43	my $arg = shift;
a39c5dfb	44	return () unless length $arg;
dbc0d4f2	45
63098191	46	if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
4e2cda5d	47
fb121860 KW	48	# khw claims that this is poor interface design. The function should
	49	# return either an ord or a chr for all inputs; not be bipolar. But
	50	# can't change it because of backward compatibility. New code can use
	51	# string_vianame() instead.
5a7fb30a	52	my $ord = CORE::hex $1;
2c9cc169 KW	53	return chr utf8::unicode_to_native($ord) if $ord <= 255
2c9cc169 KW	54	|| ! ((caller 0)[8] & $bytes::hint_bits); # (caller 0)[8] is the caller's compile-time hints; the mask tests whether "use bytes" is in effect there
e7a078a0	55	_charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord);
5a7fb30a	56	return;
63098191	57	}
daf0d493	58
fb121860 KW	59	# The first 1 arg means wants an ord returned; the second that we are in
fb121860 KW	60	# runtime, and this is the first level routine called from the user
e7a078a0	61	return _charnames::lookup_name($arg, 1, 1);
35c0985d	62	} # vianame
b177ca84	63
fb121860 KW	64	sub string_vianame {
	65
	66	# Looks up the character name and returns its string representation if
	67	# found, undef otherwise.
	68
	69	if (@_ != 1) {
e7a078a0	70	_charnames::carp "charnames::string_vianame() expects one name argument";
fb121860 KW	71	return;
	72	}
	73
	74	my $arg = shift;
a39c5dfb	75	return () unless length $arg;
fb121860 KW	76
	77	if ($arg =~ /^U\+([0-9a-fA-F]+)$/) {
	78
	79	my $ord = CORE::hex $1;
2c9cc169 KW	80	return chr utf8::unicode_to_native($ord) if $ord <= 255
2c9cc169 KW	81	|| ! ((caller 0)[8] & $bytes::hint_bits); # (caller 0)[8] is the caller's compile-time hints; the mask tests whether "use bytes" is in effect there
fb121860	82
e7a078a0	83	_charnames::carp _charnames::not_legal_use_bytes_msg($arg, chr $ord);
fb121860 KW	84	return;
	85	}
	86
	87	# The 0 arg means wants a string returned; the 1 arg means that we are in
	88	# runtime, and this is the first level routine called from the user
e7a078a0	89	return _charnames::lookup_name($arg, 0, 1);
fb121860 KW	90	} # string_vianame
fb121860 KW	91
423cee85 JH	92	1;
	93	__END__
	94
bde9e88d KW	95	=encoding utf8
bde9e88d KW	96
423cee85 JH	97	=head1 NAME
423cee85 JH	98
fb121860	99	charnames - access to Unicode character names and named character sequences; also define character names
423cee85 JH	100
	101	=head1 SYNOPSIS
	102
bcc08981 KW	103	use charnames ':full';
	104	print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
	105	print "\N{LATIN CAPITAL LETTER E WITH VERTICAL LINE BELOW}",
	106	" is an officially named sequence of two Unicode characters\n";
	107
38f4139d KW	108	use charnames ':loose';
	109	print "\N{Greek small-letter sigma}",
	110	" can be used to ignore case, underscores, most blanks,",
	111	" and when you aren't sure if the official name has hyphens\n";
	112
bcc08981 KW	113	use charnames ':short';
	114	print "\N{greek:Sigma} is an upper-case sigma.\n";
	115
	116	use charnames qw(cyrillic greek);
	117	print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
	118
bde9e88d	119	use utf8;
bcc08981 KW	120	use charnames ":full", ":alias" => {
	121	e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
	122	mychar => 0xE8000, # Private use area
bde9e88d	123	"自転車に乗る人" => "BICYCLIST"
bcc08981 KW	124	};
bcc08981 KW	125	print "\N{e_ACUTE} is a small letter e with an acute.\n";
14aeae98	126	print "\N{mychar} allows me to name private use characters.\n";
bde9e88d KW	127	print "And I can create synonyms in other languages,",
bde9e88d KW	128	" such as \N{自転車に乗る人} for BICYCLIST (U+1F6B4)\n";
bcc08981 KW	129
	130	use charnames ();
	131	print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
	132	printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints
	133	# "10330"
	134	print charnames::vianame("LATIN CAPITAL LETTER A"); # prints 65 on
	135	# ASCII platforms;
	136	# 193 on EBCDIC
	137	print charnames::string_vianame("LATIN CAPITAL LETTER A"); # prints "A"
b177ca84	138
423cee85 JH	139	=head1 DESCRIPTION
423cee85 JH	140
da9dec57	141	Pragma C<use charnames> is used to gain access to the names of the
fb121860 KW	142	Unicode characters and named character sequences, and to allow you to define
	143	your own character and character sequence names.
	144
	145	All forms of the pragma enable use of the following 3 functions:
	146
	147	=over
	148
	149	=item *
	150
	151	L</charnames::string_vianame(I<name>)> for run-time lookup of
	152	either a character name or a named character sequence, returning its string
	153	representation
	154
	155	=item *
	156
	157	L</charnames::vianame(I<name>)> for run-time lookup of a
	158	character name (but not a named character sequence) to get its ordinal value
	159	(code point)
da9dec57	160
fb121860	161	=item *
da9dec57	162
fb121860 KW	163	L</charnames::viacode(I<code>)> for run-time lookup of a code point to get its
	164	Unicode name.
	165
	166	=back
	167
1f3b4888	168	Starting in Perl v5.16, any occurrence of C<\N{I<CHARNAME>}> sequences
fbb93542 KW	169	in a double-quotish string automatically loads this module with arguments
	170	C<:full> and C<:short> (described below) if it hasn't already been loaded with
	171	different arguments, in order to compile the named Unicode character into
1f3b4888 KW	172	position in the string. Prior to v5.16, an explicit S<C<use charnames>> was
1f3b4888 KW	173	required to enable this usage. (However, prior to v5.16, the form C<S<"use
fbb93542	174	charnames ();">> did not enable C<\N{I<CHARNAME>}>.)
da9dec57 KW	175
da9dec57 KW	176	Note that C<\N{U+I<...>}>, where the I<...> is a hexadecimal number,
fbb93542	177	also inserts a character into a string.
22bd7dd2	178	The character it inserts is the one whose Unicode code point
da9dec57	179	(ordinal value) is equal to the number. For example, C<"\N{U+263a}"> is
fbb93542 KW	180	the Unicode (white background, black foreground) smiley face
fbb93542 KW	181	equivalent to C<"\N{WHITE SMILING FACE}">.
d9f23c72	182	Also note, C<\N{I<...>}> can mean a regex quantifier instead of a character
8ebef31d KW	183	name, when the I<...> is a number (or comma separated pair of numbers;
8ebef31d KW	184	see L<perlreref/QUANTIFIERS>), and is not related to this pragma.
da9dec57	185
38f4139d KW	186	The C<charnames> pragma supports arguments C<:full>, C<:loose>, C<:short>,
	187	script names and L<customized aliases|/CUSTOM ALIASES>.
	188
	189	If C<:full> is present, for expansion of
da9dec57	190	C<\N{I<CHARNAME>}>, the string I<CHARNAME> is first looked up in the list of
38f4139d KW	191	standard Unicode character names.
	192
	193	C<:loose> is a variant of C<:full> which allows I<CHARNAME> to be less
	194	precisely specified. Details are in L</LOOSE MATCHES>.
	195
	196	If C<:short> is present, and
da9dec57	197	I<CHARNAME> has the form C<I<SCRIPT>:I<CNAME>>, then I<CNAME> is looked up
14aeae98 KW	198	as a letter in script I<SCRIPT>, as described in the next paragraph.
14aeae98 KW	199	Or, if C<use charnames> is used
da9dec57 KW	200	with script name arguments, then for C<\N{I<CHARNAME>}> the name
da9dec57 KW	201	I<CHARNAME> is looked up as a letter in the given scripts (in the
16036bcd KW	202	specified order). Customized aliases can override these, and are explained in
16036bcd KW	203	L</CUSTOM ALIASES>.
423cee85	204
1f3b4888	205	For lookup of I<CHARNAME> inside a given script I<SCRIPTNAME>,
14aeae98	206	this pragma looks in the table of standard Unicode names for the names
423cee85 JH	207
	208	SCRIPTNAME CAPITAL LETTER CHARNAME
	209	SCRIPTNAME SMALL LETTER CHARNAME
	210	SCRIPTNAME LETTER CHARNAME
	211
14aeae98	212	If I<CHARNAME> is all lowercase,
daf0d493	213	then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
14aeae98	214	is ignored, and both I<CHARNAME> and I<SCRIPTNAME> are converted to all
38f4139d KW	215	uppercase for look-up. Other than that, both of them follow L<loose|/LOOSE
38f4139d KW	216	MATCHES> rules if C<:loose> is also specified; strict otherwise.
daf0d493	217
da9dec57 KW	218	Note that C<\N{...}> is compile-time; it's a special form of string
da9dec57 KW	219	constant used inside double-quotish strings; this means that you cannot
4e2cda5d	220	use variables inside the C<\N{...}>. If you want similar run-time
fb121860 KW	221	functionality, use
fb121860 KW	222	L<charnames::string_vianame()|/charnames::string_vianame(I<name>)>.
423cee85	223
67db75e3 KW	224	Note, starting in Perl 5.18, the name C<BELL> refers to the Unicode character
	225	U+1F514, instead of the traditional U+0007. For the latter, use C<ALERT>
	226	or C<BEL>.
301a3cda	227
90249f0a	228	It is a syntax error to use C<\N{NAME}> where C<NAME> is unknown.
e5432b89	229
8ebef31d KW	230	For C<\N{NAME}>, it is a fatal error if C<use bytes> is in effect and the
	231	input name is that of a character that won't fit into a byte (i.e., whose
	232	ordinal is above 255).
e5432b89	233
da9dec57	234	Otherwise, any string that includes a C<\N{I<charname>}> or
850b7ec9	235	C<S<\N{U+I<code point>}>> will automatically have Unicode rules (see
da9dec57 KW	236	L<perlunicode/Byte and Character Semantics>).
da9dec57 KW	237
38f4139d KW	238	=head1 LOOSE MATCHES
	239
	240	By specifying C<:loose>, Unicode's L<loose character name
5ef88e32	241	matching|http://www.unicode.org/reports/tr44#Matching_Rules> rules are
38f4139d KW	242	selected instead of the strict exact match used otherwise.
	243	That means that I<CHARNAME> doesn't have to be so precisely specified.
	244	Upper/lower case doesn't matter (except with scripts as mentioned above), nor
	245	do any underscores, and the only hyphens that matter are those at the
	246	beginning or end of a word in the name (with one exception: the hyphen in
	247	U+1180 C<HANGUL JUNGSEONG O-E> does matter).
	248	Also, blanks not adjacent to hyphens don't matter.
	249	The official Unicode names are quite variable as to where they use hyphens
	250	versus spaces to separate word-like units, and this option allows you to not
	251	have to care as much.
	252	The reason non-medial hyphens matter is because of cases like
	253	U+0F60 C<TIBETAN LETTER -A> versus U+0F68 C<TIBETAN LETTER A>.
	254	The hyphen here is significant, as is the space before it, and so both must be
	255	included.
	256
	257	C<:loose> slows down look-ups by a factor of 2 to 3 versus
	258	C<:full>, but the trade-off may be worth it to you. Each individual look-up
	259	takes very little time, and the results are cached, so the speed difference
	260	would become a factor only in programs that do look-ups of many different
67db75e3 KW	261	spellings, and probably only when those look-ups are through C<vianame()> and
67db75e3 KW	262	C<string_vianame()>, since C<\N{...}> look-ups are done at compile time.
38f4139d	263
5ffe0e96	264	=head1 ALIASES
423cee85	265
7620cb10 KW	266	Starting in Unicode 6.1 and Perl v5.16, Unicode defines many abbreviations and
	267	names that were formerly Perl extensions, and some additional ones that Perl
	268	did not previously accept. The list is getting too long to reproduce here,
	269	but you can get the complete list from the Unicode web site:
	270	L<http://www.unicode.org/Public/UNIDATA/NameAliases.txt>.
	271
	272	Earlier versions of Perl accepted almost all the 6.1 names. These were most
	273	extensively documented in the v5.14 version of this pod:
	274	L<http://perldoc.perl.org/5.14.0/charnames.html#ALIASES>.
16036bcd	275
35c0985d MB	276	=head1 CUSTOM ALIASES
35c0985d MB	277
1f31fcd4 KW	278	You can add customized aliases to standard (C<:full>) Unicode naming
1f31fcd4 KW	279	conventions. The aliases override any standard definitions, so, if
da9dec57 KW	280	you're twisted enough, you can change C<"\N{LATIN CAPITAL LETTER A}"> to
da9dec57 KW	281	mean C<"B">, etc.
55bc7d3c	282
bde9e88d	283	Aliases must begin with a character that is alphabetic. After that, each may
558de9fa	284	contain any combination of word (C<\w>) characters, SPACE (U+0020),
754e15cf KW	285	HYPHEN-MINUS (U+002D), LEFT PARENTHESIS (U+0028), and RIGHT PARENTHESIS
	286	(U+0029). These last two should never have been allowed
	287	in names, and are retained for backwards compatibility only, and may be
bde9e88d KW	288	deprecated and removed in future releases of Perl, so don't use them for new
	289	names. (More precisely, the first character of a name you specify must be
	290	something that matches all of C<\p{ID_Start}>, C<\p{Alphabetic}>, and
	291	C<\p{Gc=Letter}>. This makes sure it is what any reasonable person would view
558de9fa KW	292	as an alphabetic character. And, the continuation characters that match C<\w>
558de9fa KW	293	must also match C<\p{ID_Continue}>.) Starting with Perl v5.18, any Unicode
bde9e88d KW	294	characters meeting the above criteria may be used; prior to that only
bde9e88d KW	295	Latin1-range characters were acceptable.
e5432b89	296
38f4139d KW	297	An alias can map to either an official Unicode character name (not a loose
38f4139d KW	298	matched name) or to a
e5432b89 KW	299	numeric code point (ordinal). The latter is useful for assigning names
e5432b89 KW	300	to code points in Unicode private use areas such as U+E800 through
f12d74c0	301	U+F8FF.
055bf491	302	A numeric code point must be a non-negative integer, or a string beginning
f12d74c0 KW	303	with C<"U+"> or C<"0x"> with the remainder considered to be a
	304	hexadecimal integer. A literal numeric constant must be unsigned; it
	305	will be interpreted as hex if it has a leading zero or contains
	306	non-decimal hex digits; otherwise it will be interpreted as decimal.
22bd7dd2 KW	307	If it begins with C<"U+">, it is interpreted as the Unicode code point;
	308	otherwise it is interpreted as native. (Only code points below 256 can
	309	differ between Unicode and native.) Thus C<U+41> is always the Latin letter
	310	"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
232cbbee	311
da9dec57	312	Aliases are added either by the use of anonymous hashes:
35c0985d	313
da9dec57	314	use charnames ":alias" => {
35c0985d	315	e_ACUTE => "LATIN SMALL LETTER E WITH ACUTE",
232cbbee	316	mychar1 => 0xE8000,
35c0985d MB	317	};
	318	my $str = "\N{e_ACUTE}";
	319
da9dec57	320	or by using a file containing aliases:
35c0985d	321
da9dec57	322	use charnames ":alias" => "pro";
35c0985d	323
8ebef31d	324	This will try to read C<"unicore/pro_alias.pl"> from the C<@INC> path. This
da9dec57	325	file should return a list in plain perl:
35c0985d MB	326
	327	(
	328	A_GRAVE => "LATIN CAPITAL LETTER A WITH GRAVE",
	329	A_CIRCUM => "LATIN CAPITAL LETTER A WITH CIRCUMFLEX",
	330	A_DIAERES => "LATIN CAPITAL LETTER A WITH DIAERESIS",
	331	A_TILDE => "LATIN CAPITAL LETTER A WITH TILDE",
	332	A_BREVE => "LATIN CAPITAL LETTER A WITH BREVE",
	333	A_RING => "LATIN CAPITAL LETTER A WITH RING ABOVE",
	334	A_MACRON => "LATIN CAPITAL LETTER A WITH MACRON",
f12d74c0	335	mychar2 => "U+E8001",
35c0985d MB	336	);
35c0985d MB	337
da9dec57 KW	338	Both these methods insert C<":full"> automatically as the first argument (if no
	339	other argument is given), and you can give the C<":full"> explicitly as
	340	well, like
35c0985d	341
da9dec57	342	use charnames ":full", ":alias" => "pro";
35c0985d	343
38f4139d KW	344	C<":loose"> has no effect with these. Input names must match exactly, using
	345	C<":full"> rules.
	346
14aeae98	347	Also, both these methods currently allow only single characters to be named.
8ebef31d KW	348	To name a sequence of characters, use a
	349	L<custom translator|/CUSTOM TRANSLATORS> (described below).
	350
228e8c7b KW	351	=head1 charnames::string_vianame(I<name>)
	352
	353	This is a runtime equivalent to C<\N{...}>. I<name> can be any expression
	354	that evaluates to a name accepted by C<\N{...}> under the L<C<:full>
	355	option|/DESCRIPTION> to C<charnames>. In addition, any other options for the
	356	controlling C<"use charnames"> in the same scope apply, like C<:loose> or any
	357	L<script list, C<:short> option|/DESCRIPTION>, or L<custom aliases|/CUSTOM
	358	ALIASES> you may have defined.
	359
0fe83d7d KW	360	The only differences are due to the fact that C<string_vianame> is run-time
	361	and C<\N{}> is compile time. You can't interpolate inside a C<\N{}>, (so
	362	C<\N{$variable}> doesn't work); and if the input name is unknown,
	363	C<string_vianame> returns C<undef> instead of it being a syntax error.
228e8c7b KW	364
	365	=head1 charnames::vianame(I<name>)
	366
	367	This is similar to C<string_vianame>. The main difference is that under most
2f8114fb	368	circumstances, C<vianame> returns an ordinal code
228e8c7b KW	369	point, whereas C<string_vianame> returns a string. For example,
	370
	371	printf "U+%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
	372
	373	prints "U+2722".
	374
	375	This leads to the other two differences. Since a single code point is
	376	returned, the function can't handle named character sequences, as these are
	377	composed of multiple characters (it returns C<undef> for these). And, the code
	378	point can be that of any
	379	character, even ones that aren't legal under the C<S<use bytes>> pragma.
	380
	381	See L</BUGS> for the circumstances in which the behavior differs
	382	from that described above.
	383
da9dec57	384	=head1 charnames::viacode(I<code>)
b177ca84 JF	385
b177ca84 JF	386	Returns the full name of the character indicated by the numeric code.
da9dec57	387	For example,
b177ca84 JF	388
	389	print charnames::viacode(0x2722);
	390
	391	prints "FOUR TEARDROP-SPOKED ASTERISK".
	392
f6067adc KW	393	The name returned is the "best" (defined below) official name or alias
f6067adc KW	394	for the code point, if
ffec6758 KW	395	available; otherwise your custom alias for it, if defined; otherwise C<undef>.
	396	This means that your alias will only be returned for code points that don't
	397	have an official Unicode name (nor alias) such as private use code points.
7620cb10	398
da9dec57 KW	399	If you define more than one name for the code point, it is indeterminate
	400	which one will be returned.
	401
ffec6758	402	As mentioned, the function returns C<undef> if no name is known for the code
67db75e3	403	point. In Unicode the proper name for these is the empty string, which
da9dec57 KW	404	C<undef> stringifies to. (If you ask for a code point past the legal
da9dec57 KW	405	Unicode maximum of U+10FFFF that you haven't assigned an alias to, you
f12d74c0 KW	406	get C<undef> plus a warning.)
f12d74c0 KW	407
1f3b4888	408	The input number must be a non-negative integer, or a string beginning
f12d74c0 KW	409	with C<"U+"> or C<"0x"> with the remainder considered to be a
	410	hexadecimal integer. A literal numeric constant must be unsigned; it
	411	will be interpreted as hex if it has a leading zero or contains
	412	non-decimal hex digits; otherwise it will be interpreted as decimal.
22bd7dd2 KW	413	If it begins with C<"U+">, it is interpreted as the Unicode code point;
	414	otherwise it is interpreted as native. (Only code points below 256 can
	415	differ between Unicode and native.) Thus C<U+41> is always the Latin letter
	416	"A"; but C<0x41> can be "NO-BREAK SPACE" on EBCDIC platforms.
daf0d493	417
f6067adc KW	418	As mentioned above under L</ALIASES>, Unicode 6.1 defines extra names
	419	(synonyms or aliases) for some code points, most of which were already
	420	available as Perl extensions. All these are accepted by C<\N{...}> and the
	421	other functions in this module, but C<viacode> has to choose which one
	422	name to return for a given input code point, so it returns the "best" name.
	423	To understand how this works, it is helpful to know more about the Unicode
	424	name properties. All code points actually have only a single name, which
	425	(starting in Unicode 2.0) can never change once a character has been assigned
	426	to the code point. But mistakes have been made in assigning names, for
	427	example sometimes a clerical error was made during the publishing of the
	428	Standard which caused words to be misspelled, and there was no way to correct
	429	those. The Name_Alias property was eventually created to handle these
	430	situations. If a name was wrong, a corrected synonym would be published for
	431	it, using Name_Alias. C<viacode> will return that corrected synonym as the
	432	"best" name for a code point. (It is even possible, though it hasn't happened
	433	yet, that the correction itself will need to be corrected, and so another
	434	Name_Alias can be created for that code point; C<viacode> will return the
	435	most recent correction.)
	436
	437	The Unicode name for each of the control characters (such as LINE FEED) is the
	438	empty string. However almost all had names assigned by other standards, such
	439	as the ASCII Standard, or were in common use. C<viacode> returns these names
	440	as the "best" ones available. Unicode 6.1 has created Name_Aliases for each
	441	of them, including alternate names, like NEW LINE. C<viacode> uses the
	442	original name, "LINE FEED" in preference to the alternate. Similarly the
	443	name returned for U+FEFF is "ZERO WIDTH NO-BREAK SPACE", not "BYTE ORDER
	444	MARK".
	445
	446	Until Unicode 6.1, the 4 control characters U+0080, U+0081, U+0084, and U+0099
	447	did not have names nor aliases.
	448	To preserve backwards compatibility, any alias you define for these code
	449	points will be returned by this function, in preference to the official name.
	450
	451	Some code points also have abbreviated names, such as "LF" or "NL".
	452	C<viacode> never returns these.
	453
	454	Because a name correction may be added in future Unicode releases, the name
	455	that C<viacode> returns may change as a result. This is a rare event, but it
	456	does happen.
274085e3	457
5ffe0e96	458	=head1 CUSTOM TRANSLATORS
52ea3e69	459
5ffe0e96	460	The mechanism of translation of C<\N{...}> escapes is general and not
5ef88e32	461	hardwired into F<charnames.pm>. A module can install custom
5ffe0e96 MB	462	translations (inside the scope which C<use>s the module) with the
5ffe0e96 MB	463	following magic incantation:
52ea3e69	464
5ffe0e96	465	sub import {
52fb7278 KW	466	shift;
52fb7278 KW	467	$^H{charnames} = \&translator;
5ffe0e96	468	}
52ea3e69	469
da9dec57	470	Here translator() is a subroutine which takes I<CHARNAME> as an
5ffe0e96	471	argument, and returns text to insert into the string instead of the
5ef88e32 KW	472	C<\N{I<CHARNAME>}> escape.
	473
	474	This is the only way you can create a custom named sequence of code points.
	475
	476	Since the text to insert should be different
5ffe0e96 MB	477	in C<bytes> mode and out of it, the function should check the current
5ffe0e96 MB	478	state of C<bytes>-flag as in:
52ea3e69	479
52fb7278	480	use bytes (); # for $bytes::hint_bits
5ffe0e96	481	sub translator {
52fb7278 KW	482	if ($^H & $bytes::hint_bits) {
	483	return bytes_translator(@_);
	484	}
	485	else {
	486	return utf8_translator(@_);
	487	}
5ffe0e96	488	}
52ea3e69	489
da9dec57	490	See L</CUSTOM ALIASES> above for restrictions on I<CHARNAME>.
f0175764	491
9e808deb KW	492	Of course, C<vianame>, C<viacode>, and C<string_vianame> would need to be
9e808deb KW	493	overridden as well.
1f31fcd4	494
423cee85 JH	495	=head1 BUGS
423cee85 JH	496
14aeae98	497	vianame() normally returns an ordinal code point, but when the input name is of
8ebef31d KW	498	the form C<U+...>, it returns a chr instead. In this case, if C<use bytes> is
	499	in effect and the character won't fit into a byte, it returns C<undef> and
	500	raises a warning.
55bc7d3c	501
f12d74c0 KW	502	Since evaluation of the translation function (see L</CUSTOM
	503	TRANSLATORS>) happens in the middle of compilation (of a string
	504	literal), the translation function should not do any C<eval>s or
	505	C<require>s. This restriction should be lifted (but is low priority) in
	506	a future version of Perl.
423cee85 JH	507
423cee85 JH	508	=cut
0eacc33e	509
52fb7278	510	# ex: set ts=8 sts=2 sw=2 et: