[perl5.git] / lib / utf8.pm

package utf8;

# Bit mask used by this pragma within $^H: import() ORs these bits in
# and unimport() clears them again.
$utf8::hint_bits = 0x00800000;

# Called for "use utf8" (optionally with one extra argument).
# Switches on the $utf8::hint_bits bits in $^H.  If a true argument
# follows the package name, it is stored in %enc under the name of the
# package that issued the "use".
sub import {
    my $requested = $_[1];

    $^H = $^H | $utf8::hint_bits;

    if ($requested) {
        my $calling_package = caller();    # scalar context: package name only
        $enc{$calling_package} = $requested;
    }
}

# Called for "no utf8".
# Clears the $utf8::hint_bits bits in $^H and leaves every other hint
# bit as it was.
sub unimport {
    my $keep_mask = ~$utf8::hint_bits;
    $^H = $^H & $keep_mask;
}

# Fallback for calls to functions in this package that are not defined
# in this file.  Loads utf8_heavy.pl (presumably where those functions
# live -- confirm against that file) and then transfers control to the
# function named in $AUTOLOAD, passing the caller's @_ along untouched.
#
# Croaks with "Undefined subroutine <name> called" if the function is
# still undefined after the load.
sub AUTOLOAD {
    require "utf8_heavy.pl";

    # Jump only when the target really exists now.  An unconditional
    # goto to an undefined sub is routed straight back into this
    # AUTOLOAD, which would repeat forever.
    goto &$AUTOLOAD if defined &$AUTOLOAD;

    require Carp;
    Carp::croak("Undefined subroutine $AUTOLOAD called");
}

1;
__END__

=head1 NAME

utf8 - Perl pragma to enable/disable UTF-8 in source code

=head1 SYNOPSIS

    use utf8;
    no utf8;

=head1 DESCRIPTION

WARNING: The implementation of Unicode support in Perl is incomplete.
See L<perlunicode> for the exact details.

The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
program text in the current lexical scope.  The C<no utf8> pragma
tells Perl to switch back to treating the source text as literal
bytes in the current lexical scope.

This pragma is primarily a compatibility device.  Perl versions
earlier than 5.6 allowed arbitrary bytes in source code, whereas
in future we would like to standardize on the UTF-8 encoding for
source text.  Until UTF-8 becomes the default format for source
text, this pragma should be used to recognize UTF-8 in the source.
When UTF-8 becomes the standard source format, this pragma will
effectively become a no-op.

Enabling the C<utf8> pragma has the following effects:

=over

=item *

Bytes in the source text that have their high-bit set will be treated
as being part of a literal UTF-8 character.  This includes most literals
such as identifiers, string constants, constant regular expression patterns
and package names.

=item *

In the absence of inputs marked as UTF-8, regular expressions within the
scope of this pragma will default to using character semantics instead
of byte semantics.

    @bytes_or_chars = split //, $data;	# may split to bytes if
					# $data isn't UTF-8
    {
	use utf8;			# force char semantics
	@chars = split //, $data;	# splits characters
    }

=back

=head1 SEE ALSO

L<perlunicode>, L<bytes>

=cut
Commit	Line	Data
a0ed51b3 LW	1	package utf8;
a0ed51b3 LW	2
d5448623 GS	3	$utf8::hint_bits = 0x00800000;
d5448623 GS	4
a0ed51b3	5	sub import {
d5448623	6	$^H \|= $utf8::hint_bits;
a0ed51b3 LW	7	$enc{caller()} = $_[1] if $_[1];
	8	}
	9
	10	sub unimport {
d5448623	11	$^H &= ~$utf8::hint_bits;
a0ed51b3 LW	12	}
	13
	14	sub AUTOLOAD {
	15	require "utf8_heavy.pl";
	16	goto &$AUTOLOAD;
	17	}
	18
	19	1;
	20	__END__
	21
	22	=head1 NAME
	23
393fec97	24	utf8 - Perl pragma to enable/disable UTF-8 in source code
a0ed51b3 LW	25
	26	=head1 SYNOPSIS
	27
	28	use utf8;
	29	no utf8;
	30
	31	=head1 DESCRIPTION
	32
393fec97	33	WARNING: The implementation of Unicode support in Perl is incomplete.
21bad921	34	See L<perlunicode> for the exact details.
a0ed51b3	35
393fec97 GS	36	The C<use utf8> pragma tells the Perl parser to allow UTF-8 in the
	37	program text in the current lexical scope. The C<no utf8> pragma
	38	tells Perl to switch back to treating the source text as literal
	39	bytes in the current lexical scope.
a0ed51b3	40
393fec97 GS	41	This pragma is primarily a compatibility device. Perl versions
	42	earlier than 5.6 allowed arbitrary bytes in source code, whereas
	43	in future we would like to standardize on the UTF-8 encoding for
	44	source text. Until UTF-8 becomes the default format for source
	45	text, this pragma should be used to recognize UTF-8 in the source.
	46	When UTF-8 becomes the standard source format, this pragma will
	47	effectively become a no-op.
a0ed51b3	48
393fec97	49	Enabling the C<utf8> pragma has the following effects:
a0ed51b3	50
393fec97	51	=over
a0ed51b3 LW	52
	53	=item *
	54
393fec97 GS	55	Bytes in the source text that have their high-bit set will be treated
	56	as being part of a literal UTF-8 character. This includes most literals
	57	such as identifiers, string constants, constant regular expression patterns
	58	and package names.
a0ed51b3 LW	59
	60	=item *
	61
393fec97 GS	62	In the absence of inputs marked as UTF-8, regular expressions within the
	63	scope of this pragma will default to using character semantics instead
	64	of byte semantics.
a0ed51b3	65
393fec97 GS	66	@bytes_or_chars = split //, $data; # may split to bytes if data
	67	# $data isn't UTF-8
	68	{
	69	use utf8; # force char semantics
	70	@chars = split //, $data; # splits characters
a0ed51b3 LW	71	}
a0ed51b3 LW	72
393fec97	73	=head1 SEE ALSO
a0ed51b3	74
8058d7ab	75	L<perlunicode>, L<bytes>
a0ed51b3 LW	76
a0ed51b3 LW	77	=cut