# This is a live mirror of the Perl 5 development currently hosted at
# https://github.com/perl/perl5
# [perl5.git] / lib / bytes.pm
package bytes;

# Pragma module.  "use bytes" and "no bytes" toggle a single bit in $^H,
# the compile-time hints variable (see import() and unimport()).

use strict;
use warnings;

our $VERSION = '1.08';

# Bit mask that import() ORs into $^H and unimport() clears from it.
$bytes::hint_bits = 0x00000008;

# Called by "use bytes".  Turns the pragma on by setting the bytes hint bit
# in $^H, the hints variable for the scope currently being compiled; all
# other hint bits are left untouched.
sub import {
    $^H |= $bytes::hint_bits;
}

# Called by "no bytes".  Turns the pragma off by clearing the bytes hint bit
# in $^H; all other hint bits are left untouched.
sub unimport {
    $^H &= ~$bytes::hint_bits;
}

our $AUTOLOAD;

# Fallback for any function in this package that has no body yet.
# Loads bytes_heavy.pl on demand; if that defined the requested sub, jumps
# to it with the caller's @_ intact, otherwise croaks so the error is
# reported from the caller's perspective.
# NOTE(review): bytes_heavy.pl is not visible here; presumably it supplies
# the bodies for the forward-declared functions in this file -- confirm.
sub AUTOLOAD {
    require "bytes_heavy.pl";
    goto &$AUTOLOAD if defined &$AUTOLOAD;    # tail-call the now-defined sub
    require Carp;                             # only needed on the error path
    Carp::croak("Undefined subroutine $AUTOLOAD called");
}

# Forward declarations only.  They give these names prototypes, so calls
# parse with the intended argument shapes before any body exists.  No bodies
# are defined in this file; a call falls through to AUTOLOAD.
sub length (_);
sub chr (_);
sub ord (_);
sub substr ($$;$$);
sub index ($$;$);
sub rindex ($$;$);

1;
__END__

=head1 NAME

bytes - Perl pragma to expose the individual bytes of characters

=head1 NOTICE

Because the bytes pragma breaks encapsulation (i.e. it exposes the innards of
how the perl executable currently happens to store a string), the byte values
that result are in an unspecified encoding.

B<Use of this module for anything other than debugging purposes is
strongly discouraged.>  If you feel that the functions herein
might be useful for your application, this possibly indicates a
mismatch between your mental model of Perl's Unicode support and the
current reality.  In that case, you may wish to read some of the perl
Unicode documentation: L<perluniintro>, L<perlunitut>, L<perlunifaq> and
L<perlunicode>.

=head1 SYNOPSIS

    use bytes;
    ... chr(...);       # or bytes::chr
    ... index(...);     # or bytes::index
    ... length(...);    # or bytes::length
    ... ord(...);       # or bytes::ord
    ... rindex(...);    # or bytes::rindex
    ... substr(...);    # or bytes::substr
    no bytes;


=head1 DESCRIPTION

Perl's characters are stored internally as sequences of one or more bytes.
This pragma allows for the examination of the individual bytes that together
comprise a character.

Originally the pragma was designed for the loftier goal of helping incorporate
Unicode into Perl, but the approach that used it was found to be defective,
and the one remaining legitimate use is for debugging when you need to
non-destructively examine characters' individual bytes.  Just insert this
pragma temporarily, and remove it after the debugging is finished.

The original usage can be accomplished by explicit (rather than this pragma's
implicit) encoding using the L<Encode> module:

    use Encode qw/encode/;

    my $utf8_byte_string   = encode "UTF-8",  $string;
    my $latin1_byte_string = encode "Latin1", $string;

Or, if performance is needed and you are only interested in the UTF-8
representation:

    utf8::encode(my $utf8_byte_string = $string);

C<no bytes> can be used to reverse the effect of C<use bytes> within the
current lexical scope.

As an example, when Perl sees C<$x = chr(400)>, it encodes the character
in UTF-8 and stores it in C<$x>.  Then it is marked as character data, so,
for instance, C<length $x> returns C<1>.  However, in the scope of the
C<bytes> pragma, C<$x> is treated as a series of bytes - the bytes that make
up the UTF-8 encoding - and C<length $x> returns C<2>:

    $x = chr(400);
    print "Length is ", length $x, "\n";     # "Length is 1"
    printf "Contents are %vd\n", $x;         # "Contents are 400"
    {
        use bytes; # or "require bytes; bytes::length()"
        print "Length is ", length $x, "\n"; # "Length is 2"
        printf "Contents are %vd\n", $x;     # "Contents are 198.144"
                                             # (on ASCII platforms)
    }

C<chr()>, C<ord()>, C<substr()>, C<index()> and C<rindex()> behave similarly.

For more on the implications, see L<perluniintro> and L<perlunicode>.

C<bytes::length()> is admittedly handy if you need to know the
B<byte length> of a Perl scalar.  But a more modern way is:

    use Encode 'encode';
    length(encode('UTF-8', $scalar))

=head1 LIMITATIONS

C<bytes::substr()> does not work as an I<lvalue>; that is, it cannot be
assigned to.

=head1 SEE ALSO

L<perluniintro>, L<perlunicode>, L<utf8>, L<Encode>

=cut