perl5.git.perl.org Git - perl5.git/blame_incremental

... / ...

Commit	Line	Data
	1	package charnames;
	2	use strict;
	3	use warnings;
	4	use Carp;
	5	our $VERSION = '1.01';
	6
	7	use bytes (); # for $bytes::hint_bits
	8	$charnames::hint_bits = 0x20000;
	9
	10	my %alias1 = (
	11	# Icky 3.2 names with parentheses.
	12	'LINE FEED' => 'LINE FEED (LF)',
	13	'FORM FEED' => 'FORM FEED (FF)',
	14	'CARRIAGE RETURN' => 'CARRIAGE RETURN (CR)',
	15	'NEXT LINE' => 'NEXT LINE (NEL)',
	16	# Convenience.
	17	'LF' => 'LINE FEED (LF)',
	18	'FF' => 'FORM FEED (FF)',
	19	'CR' => 'CARRIAGE RETURN (LF)',
	20	'NEL' => 'NEXT LINE (NEL)',
	21	# More convenience. For futher convencience,
	22	# it is suggested some way using using the NamesList
	23	# aliases is implemented.
	24	'ZWNJ' => 'ZERO WIDTH NON-JOINER',
	25	'ZWJ' => 'ZERO WIDTH JOINER',
	26	'BOM' => 'BYTE ORDER MARK',
	27	);
	28
	29	my %alias2 = (
	30	# Pre-3.2 compatibility (only for the first 256 characters).
	31	'HORIZONTAL TABULATION' => 'CHARACTER TABULATION',
	32	'VERTICAL TABULATION' => 'LINE TABULATION',
	33	'FILE SEPARATOR' => 'INFORMATION SEPARATOR FOUR',
	34	'GROUP SEPARATOR' => 'INFORMATION SEPARATOR THREE',
	35	'RECORD SEPARATOR' => 'INFORMATION SEPARATOR TWO',
	36	'UNIT SEPARATOR' => 'INFORMATION SEPARATOR ONE',
	37	'PARTIAL LINE DOWN' => 'PARTIAL LINE FORWARD',
	38	'PARTIAL LINE UP' => 'PARTIAL LINE BACKWARD',
	39	);
	40
	41	my $txt;
	42
	43	# This is not optimized in any way yet
	44	sub charnames
	45	{
	46	my $name = shift;
	47
	48	if (exists $alias1{$name}) {
	49	$name = $alias1{$name};
	50	}
	51	if (exists $alias2{$name}) {
	52	require warnings;
	53	warnings::warnif('deprecated', qq{Unicode character name "$name" is deprecated, use "$alias2{$name}" instead});
	54	$name = $alias2{$name};
	55	}
	56
	57	my $ord;
	58	my @off;
	59	my $fname;
	60
	61	if ($name eq "BYTE ORDER MARK") {
	62	$fname = $name;
	63	$ord = 0xFEFF;
	64	} else {
	65	## Suck in the code/name list as a big string.
	66	## Lines look like:
	67	## "0052\t\tLATIN CAPITAL LETTER R\n"
	68	$txt = do "unicore/Name.pl" unless $txt;
	69
	70	## @off will hold the index into the code/name string of the start and
	71	## end of the name as we find it.
	72
	73	## If :full, look for the the name exactly
	74	if ($^H{charnames_full} and $txt =~ /\t\t\Q$name\E$/m) {
	75	@off = ($-[0], $+[0]);
	76	}
	77
	78	## If we didn't get above, and :short allowed, look for the short name.
	79	## The short name is like "greek:Sigma"
	80	unless (@off) {
	81	if ($^H{charnames_short} and $name =~ /^(.+?):(.+)/s) {
	82	my ($script, $cname) = ($1,$2);
	83	my $case = ( $cname =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
	84	if ($txt =~ m/\t\t\U$script\E (?:$case )?LETTER \U\Q$cname\E$/m) {
	85	@off = ($-[0], $+[0]);
	86	}
	87	}
	88	}
	89
	90	## If we still don't have it, check for the name among the loaded
	91	## scripts.
	92	if (not @off)
	93	{
	94	my $case = ( $name =~ /[[:upper:]]/ ? "CAPITAL" : "SMALL");
	95	for my $script ( @{$^H{charnames_scripts}} )
	96	{
	97	if ($txt =~ m/\t\t$script (?:$case )?LETTER \U\Q$name\E$/m) {
	98	@off = ($-[0], $+[0]);
	99	last;
	100	}
	101	}
	102	}
	103
	104	## If we don't have it by now, give up.
	105	unless (@off) {
	106	carp "Unknown charname '$name'";
	107	return "\x{FFFD}";
	108	}
	109
	110	##
	111	## Now know where in the string the name starts.
	112	## The code, in hex, is before that.
	113	##
	114	## The code can be 4-6 characters long, so we've got to sort of
	115	## go look for it, just after the newline that comes before $off[0].
	116	##
	117	## This would be much easier if unicore/Name.pl had info in
	118	## a name/code order, instead of code/name order.
	119	##
	120	## The +1 after the rindex() is to skip past the newline we're finding,
	121	## or, if the rindex() fails, to put us to an offset of zero.
	122	##
	123	my $hexstart = rindex($txt, "\n", $off[0]) + 1;
	124
	125	## we know where it starts, so turn into number -
	126	## the ordinal for the char.
	127	$ord = hex substr($txt, $hexstart, $off[0] - $hexstart);
	128	}
	129
	130	if ($^H & $bytes::hint_bits) { # "use bytes" in effect?
	131	use bytes;
	132	return chr $ord if $ord <= 255;
	133	my $hex = sprintf "%04x", $ord;
	134	if (not defined $fname) {
	135	$fname = substr $txt, $off[0] + 2, $off[1] - $off[0] - 2;
	136	}
	137	croak "Character 0x$hex with name '$fname' is above 0xFF";
	138	}
	139
	140	no warnings 'utf8'; # allow even illegal characters
	141	return pack "U", $ord;
	142	}
	143
	144	sub import
	145	{
	146	shift; ## ignore class name
	147
	148	if (not @_)
	149	{
	150	carp("`use charnames' needs explicit imports list");
	151	}
	152	$^H \|= $charnames::hint_bits;
	153	$^H{charnames} = \&charnames ;
	154
	155	##
	156	## fill %h keys with our @_ args.
	157	##
	158	my %h;
	159	@h{@_} = (1) x @_;
	160
	161	$^H{charnames_full} = delete $h{':full'};
	162	$^H{charnames_short} = delete $h{':short'};
	163	$^H{charnames_scripts} = [map uc, keys %h];
	164
	165	##
	166	## If utf8? warnings are enabled, and some scripts were given,
	167	## see if at least we can find one letter of each script.
	168	##
	169	if (warnings::enabled('utf8') && @{$^H{charnames_scripts}})
	170	{
	171	$txt = do "unicore/Name.pl" unless $txt;
	172
	173	for my $script (@{$^H{charnames_scripts}})
	174	{
	175	if (not $txt =~ m/\t\t$script (?:CAPITAL \|SMALL )?LETTER /) {
	176	warnings::warn('utf8', "No such script: '$script'");
	177	}
	178	}
	179	}
	180	}
	181
	182	require Unicode::UCD; # for Unicode::UCD::_getcode()
	183
	184	my %viacode;
	185
	186	sub viacode
	187	{
	188	if (@_ != 1) {
	189	carp "charnames::viacode() expects one argument";
	190	return ()
	191	}
	192
	193	my $arg = shift;
	194	my $code = Unicode::UCD::_getcode($arg);
	195
	196	my $hex;
	197
	198	if (defined $code) {
	199	$hex = sprintf "%04X", $arg;
	200	} else {
	201	carp("unexpected arg \"$arg\" to charnames::viacode()");
	202	return;
	203	}
	204
	205	if ($code > 0x10FFFF) {
	206	carp "Unicode characters only allocated up to 0x10FFFF (you asked for $hex)";
	207	return;
	208	}
	209
	210	return $viacode{$hex} if exists $viacode{$hex};
	211
	212	$txt = do "unicore/Name.pl" unless $txt;
	213
	214	if ($txt =~ m/^$hex\t\t(.+)/m) {
	215	return $viacode{$hex} = $1;
	216	} else {
	217	return;
	218	}
	219	}
	220
	221	my %vianame;
	222
	223	sub vianame
	224	{
	225	if (@_ != 1) {
	226	carp "charnames::vianame() expects one name argument";
	227	return ()
	228	}
	229
	230	my $arg = shift;
	231
	232	return chr hex $1 if $arg =~ /^U\+([0-9a-fA-F]+)$/;
	233
	234	return $vianame{$arg} if exists $vianame{$arg};
	235
	236	$txt = do "unicore/Name.pl" unless $txt;
	237
	238	if ($txt =~ m/^([0-9A-F]+)\t\t($arg)/m) {
	239	return $vianame{$arg} = hex $1;
	240	} else {
	241	return;
	242	}
	243	}
	244
	245
	246	1;
	247	__END__
	248
	249	=head1 NAME
	250
	251	charnames - define character names for C<\N{named}> string literal escapes
	252
	253	=head1 SYNOPSIS
	254
	255	use charnames ':full';
	256	print "\N{GREEK SMALL LETTER SIGMA} is called sigma.\n";
	257
	258	use charnames ':short';
	259	print "\N{greek:Sigma} is an upper-case sigma.\n";
	260
	261	use charnames qw(cyrillic greek);
	262	print "\N{sigma} is Greek sigma, and \N{be} is Cyrillic b.\n";
	263
	264	print charnames::viacode(0x1234); # prints "ETHIOPIC SYLLABLE SEE"
	265	printf "%04X", charnames::vianame("GOTHIC LETTER AHSA"); # prints "10330"
	266
	267	=head1 DESCRIPTION
	268
	269	Pragma C<use charnames> supports arguments C<:full>, C<:short> and
	270	script names. If C<:full> is present, for expansion of
	271	C<\N{CHARNAME}> string C<CHARNAME> is first looked in the list of
	272	standard Unicode names of chars. If C<:short> is present, and
	273	C<CHARNAME> has the form C<SCRIPT:CNAME>, then C<CNAME> is looked up
	274	as a letter in script C<SCRIPT>. If pragma C<use charnames> is used
	275	with script name arguments, then for C<\N{CHARNAME}> the name
	276	C<CHARNAME> is looked up as a letter in the given scripts (in the
	277	specified order).
	278
	279	For lookup of C<CHARNAME> inside a given script C<SCRIPTNAME>
	280	this pragma looks for the names
	281
	282	SCRIPTNAME CAPITAL LETTER CHARNAME
	283	SCRIPTNAME SMALL LETTER CHARNAME
	284	SCRIPTNAME LETTER CHARNAME
	285
	286	in the table of standard Unicode names. If C<CHARNAME> is lowercase,
	287	then the C<CAPITAL> variant is ignored, otherwise the C<SMALL> variant
	288	is ignored.
	289
	290	Note that C<\N{...}> is compile-time, it's a special form of string
	291	constant used inside double-quoted strings: in other words, you cannot
	292	use variables inside the C<\N{...}>. If you want similar run-time
	293	functionality, use charnames::vianame().
	294
	295	For the C0 and C1 control characters (U+0000..U+001F, U+0080..U+009F)
	296	as of Unicode 3.1, there are no official Unicode names but you can use
	297	instead the ISO 6429 names (LINE FEED, ESCAPE, and so forth). In
	298	Unicode 3.2 (as of Perl 5.8) some naming changes take place ISO 6429
	299	has been updated, see L</ALIASES>. Also note that the U+UU80, U+0081,
	300	U+0084, and U+0099 do not have names even in ISO 6429.
	301
	302	Since the Unicode standard uses "U+HHHH", so can you: "\N{U+263a}"
	303	is the Unicode smiley face, or "\N{WHITE SMILING FACE}".
	304
	305	=head1 CUSTOM TRANSLATORS
	306
	307	The mechanism of translation of C<\N{...}> escapes is general and not
	308	hardwired into F<charnames.pm>. A module can install custom
	309	translations (inside the scope which C<use>s the module) with the
	310	following magic incantation:
	311
	312	use charnames (); # for $charnames::hint_bits
	313	sub import {
	314	shift;
	315	$^H \|= $charnames::hint_bits;
	316	$^H{charnames} = \&translator;
	317	}
	318
	319	Here translator() is a subroutine which takes C<CHARNAME> as an
	320	argument, and returns text to insert into the string instead of the
	321	C<\N{CHARNAME}> escape. Since the text to insert should be different
	322	in C<bytes> mode and out of it, the function should check the current
	323	state of C<bytes>-flag as in:
	324
	325	use bytes (); # for $bytes::hint_bits
	326	sub translator {
	327	if ($^H & $bytes::hint_bits) {
	328	return bytes_translator(@_);
	329	}
	330	else {
	331	return utf8_translator(@_);
	332	}
	333	}
	334
	335	=head1 charnames::viacode(code)
	336
	337	Returns the full name of the character indicated by the numeric code.
	338	The example
	339
	340	print charnames::viacode(0x2722);
	341
	342	prints "FOUR TEARDROP-SPOKED ASTERISK".
	343
	344	Returns undef if no name is known for the code.
	345
	346	This works only for the standard names, and does not yet apply
	347	to custom translators.
	348
	349	Notice that the name returned for of U+FEFF is "ZERO WIDTH NO-BREAK
	350	SPACE", not "BYTE ORDER MARK".
	351
	352	=head1 charnames::vianame(name)
	353
	354	Returns the code point indicated by the name.
	355	The example
	356
	357	printf "%04X", charnames::vianame("FOUR TEARDROP-SPOKED ASTERISK");
	358
	359	prints "2722".
	360
	361	Returns undef if the name is unknown.
	362
	363	This works only for the standard names, and does not yet apply
	364	to custom translators.
	365
	366	=head1 ALIASES
	367
	368	A few aliases have been defined for convenience: instead of having
	369	to use the official names
	370
	371	LINE FEED (LF)
	372	FORM FEED (FF)
	373	CARRIAGE RETURN (CR)
	374	NEXT LINE (NEL)
	375
	376	(yes, with parentheses) one can use
	377
	378	LINE FEED
	379	FORM FEED
	380	CARRIAGE RETURN
	381	NEXT LINE
	382	LF
	383	FF
	384	CR
	385	NEL
	386
	387	One can also use
	388
	389	BYTE ORDER MARK
	390	BOM
	391
	392	and
	393
	394	ZWNJ
	395	ZWJ
	396
	397	for ZERO WIDTH NON-JOINER and ZERO WIDTH JOINER.
	398
	399	For backward compatibility one can use the old names for
	400	certain C0 and C1 controls
	401
	402	old new
	403
	404	HORIZONTAL TABULATION CHARACTER TABULATION
	405	VERTICAL TABULATION LINE TABULATION
	406	FILE SEPARATOR INFORMATION SEPARATOR FOUR
	407	GROUP SEPARATOR INFORMATION SEPARATOR THREE
	408	RECORD SEPARATOR INFORMATION SEPARATOR TWO
	409	UNIT SEPARATOR INFORMATION SEPARATOR ONE
	410	PARTIAL LINE DOWN PARTIAL LINE FORWARD
	411	PARTIAL LINE UP PARTIAL LINE BACKWARD
	412
	413	but the old names in addition to giving the character
	414	will also give a warning about being deprecated.
	415
	416	=head1 ILLEGAL CHARACTERS
	417
	418	If you ask for a character that does not exist, a warning is given
	419	and the Unicode I<replacement character> "\x{FFFD}" is returned.
	420
	421	=head1 BUGS
	422
	423	Since evaluation of the translation function happens in a middle of
	424	compilation (of a string literal), the translation function should not
	425	do any C<eval>s or C<require>s. This restriction should be lifted in
	426	a future version of Perl.
	427
	428	=cut