perl5.git.perl.org Git - perl5.git/blame

Commit	Line	Data
657b208b	1	package bytes;
5bc28da9	2
ba6f05db TR	3	use strict;
	4	use warnings;
	5
	6	our $VERSION = '1.08';
b75c8c73	7
d5448623 GS	8	$bytes::hint_bits = 0x00000008;
d5448623 GS	9
5bc28da9	10	sub import {
d5448623	11	$^H |= $bytes::hint_bits;
5bc28da9 NIS	12	}
	13
	14	sub unimport {
d5448623	15	$^H &= ~$bytes::hint_bits;
5bc28da9 NIS	16	}
5bc28da9 NIS	17
ba6f05db	18	our $AUTOLOAD;
5bc28da9	19	sub AUTOLOAD {
657b208b	20	require "bytes_heavy.pl";
5b5a256a TS	21	goto &$AUTOLOAD if defined &$AUTOLOAD;
	22	require Carp;
	23	Carp::croak("Undefined subroutine $AUTOLOAD called");
5bc28da9 NIS	24	}
5bc28da9 NIS	25
79077e6c RGS	26	sub length (_);
	27	sub chr (_);
	28	sub ord (_);
579f6b36 JH	29	sub substr ($$;$$);
	30	sub index ($$;$);
	31	sub rindex ($$;$);
5bc28da9 NIS	32
	33	1;
	34	__END__
	35
	36	=head1 NAME
	37
01e331e5	38	bytes - Perl pragma to expose the individual bytes of characters
5bc28da9	39
490aa361	40	=head1 NOTICE
a515200d	41
01e331e5 KW	42	Because the bytes pragma breaks encapsulation (i.e. it exposes the innards of
01e331e5 KW	43	how the perl executable currently happens to store a string), the byte values
06a2b43f JH	44	that result are in an unspecified encoding.
	45
	46	B<Use of this module for anything other than debugging purposes is
	47	strongly discouraged.> If you feel that the functions herein
	48	might be useful for your application, this possibly indicates a
	49	mismatch between your mental model of Perl Unicode and the current
01e331e5 KW	50	reality. In that case, you may wish to read some of the perl Unicode
	51	documentation: L<perluniintro>, L<perlunitut>, L<perlunifaq> and
	52	L<perlunicode>.
a515200d	53
5bc28da9 NIS	54	=head1 SYNOPSIS
5bc28da9 NIS	55
657b208b	56	use bytes;
579f6b36 JH	57	... chr(...); # or bytes::chr
	58	... index(...); # or bytes::index
	59	... length(...); # or bytes::length
	60	... ord(...); # or bytes::ord
	61	... rindex(...); # or bytes::rindex
	62	... substr(...); # or bytes::substr
657b208b	63	no bytes;
5bc28da9	64
579f6b36	65
5bc28da9 NIS	66	=head1 DESCRIPTION
5bc28da9 NIS	67
01e331e5 KW	68	Perl's characters are stored internally as sequences of one or more bytes.
	69	This pragma allows for the examination of the individual bytes that together
	70	comprise a character.
	71
	72	Originally the pragma was designed for the loftier goal of helping incorporate
	73	Unicode into Perl, but the approach that used it was found to be defective,
	74	and the one remaining legitimate use is for debugging when you need to
	75	non-destructively examine characters' individual bytes. Just insert this
	76	pragma temporarily, and remove it after the debugging is finished.
	77
	78	The original usage can be accomplished by explicit (rather than this pragma's
ab473f03	79	implicit) encoding using the L<Encode> module:
01e331e5 KW	80
	81	use Encode qw/encode/;
	82
	83	my $utf8_byte_string = encode "UTF8", $string;
	84	my $latin1_byte_string = encode "Latin1", $string;
	85
	86	Or, if performance is needed and you are only interested in the UTF-8
	87	representation:
	88
01e331e5	89	utf8::encode(my $utf8_byte_string = $string);
393fec97	90
01e331e5 KW	91	C<no bytes> can be used to reverse the effect of C<use bytes> within the
01e331e5 KW	92	current lexical scope.
5de28535 SC	93
5de28535 SC	94	As an example, when Perl sees C<$x = chr(400)>, it encodes the character
01e331e5	95	in UTF-8 and stores it in C<$x>. Then it is marked as character data, so,
5de28535	96	for instance, C<length $x> returns C<1>. However, in the scope of the
01e331e5	97	C<bytes> pragma, C<$x> is treated as a series of bytes - the bytes that make
5de28535 SC	98	up the UTF-8 encoding - and C<length $x> returns C<2>:
5de28535 SC	99
01e331e5 KW	100	$x = chr(400);
	101	print "Length is ", length $x, "\n"; # "Length is 1"
	102	printf "Contents are %vd\n", $x; # "Contents are 400"
	103	{
	104	use bytes; # or "require bytes; bytes::length()"
	105	print "Length is ", length $x, "\n"; # "Length is 2"
	106	printf "Contents are %vd\n", $x; # "Contents are 198.144" (on
	107	# ASCII platforms)
	108	}
5de28535	109
01e331e5	110	C<chr()>, C<ord()>, C<substr()>, C<index()> and C<rindex()> behave similarly.
579f6b36	111
01e331e5	112	For more on the implications, see L<perluniintro> and L<perlunicode>.
579f6b36	113
06a2b43f JH	114	C<bytes::length()> is admittedly handy if you need to know the
	115	B<byte length> of a Perl scalar. But a more modern way is:
	116
	117	use Encode 'encode';
	118	length(encode('UTF-8', $scalar))
	119
579f6b36 JH	120	=head1 LIMITATIONS
579f6b36 JH	121
01e331e5	122	C<bytes::substr()> does not work as an I<lvalue>.
393fec97 GS	123
	124	=head1 SEE ALSO
	125
01e331e5	126	L<perluniintro>, L<perlunicode>, L<utf8>, L<Encode>
5bc28da9 NIS	127
5bc28da9 NIS	128	=cut