2 # $Id: Encode.pm,v 3.07 2020/07/25 12:59:10 dankogai Exp $
7 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
10 $VERSION = sprintf "%d.%02d", q$Revision: 3.07 $ =~ /(\d+)/g;
12 XSLoader::load( __PACKAGE__, $VERSION );
15 use Exporter 5.57 'import';
18 our @CARP_NOT = qw(Encode::Encoder);
20 # Public, encouraged API is exported by default
23 decode decode_utf8 encode encode_utf8 str2bytes bytes2str
24 encodings find_encoding find_mime_encoding clone_encoding
27 DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
28 PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
31 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
32 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
36 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
37 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
39 @FB_FLAGS, @FB_CONSTS,
43 all => [ @EXPORT, @EXPORT_OK ],
44 default => [ @EXPORT ],
45 fallbacks => [ @FB_CONSTS ],
46 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
49 # Documentation moved after __END__ for speed - NI-S
51 our $ON_EBCDIC = ( ord("A") == 193 );
54 use Encode::MIME::Name;
58 # Make a %Encoding package variable to allow a certain amount of cheating
61 require Encode::Config;
63 # https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
64 # to find why sig handlers inside eval{} are disabled.
# Work on a private copy of the module search path while looking for the
# optional Encode::ConfigLocal. The copy must be taken in list context:
# the former "@INC || ()" form put @INC in scalar context, so the search
# path collapsed to a single element holding the element count, which
# require then treated as a relative directory (CVE-2021-36770).
68 local @INC = @INC;
# Drop a trailing '.' so ConfigLocal is never picked up from the current
# directory; check that @INC is non-empty before reading its last element.
69 pop @INC if @INC && $INC[-1] eq '.';
70 require Encode::ConfigLocal;
75 my $arg = $_[1] || '';
76 if ( $arg eq ":all" ) {
77 %enc = ( %Encoding, %ExtModule );
81 for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
83 for my $enc ( keys %ExtModule ) {
84 $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
88 return sort { lc $a cmp lc $b }
89 grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
93 my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
94 $obj->can("perlio_ok") and return $obj->perlio_ok();
95 return 0; # safety net
101 $Encoding{$name} = $obj;
103 define_alias( $lc => $obj ) unless $lc eq $name;
106 define_alias( $alias, $obj );
108 my $class = ref($obj);
109 push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT;
110 push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
115 my ( $class, $name, $skip_external ) = @_;
117 defined($name) or return;
119 $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
121 ref($name) && $name->can('renew') and return $name;
122 exists $Encoding{$name} and return $Encoding{$name};
124 exists $Encoding{$lc} and return $Encoding{$lc};
126 my $oc = $class->find_alias($name);
127 defined($oc) and return $oc;
128 $lc ne $name and $oc = $class->find_alias($lc);
129 defined($oc) and return $oc;
131 unless ($skip_external) {
132 if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
135 eval { require $mod; };
136 exists $Encoding{$name} and return $Encoding{$name};
142 # HACK: These two functions must be defined in Encode and because of
143 # cyclic dependency between Encode and Encode::Alias, Exporter does not work
145 goto &Encode::Alias::find_alias;
148 goto &Encode::Alias::define_alias;
151 sub find_encoding($;$) {
152 my ( $name, $skip_external ) = @_;
153 return __PACKAGE__->getEncoding( $name, $skip_external );
156 sub find_mime_encoding($;$) {
157 my ( $mime_name, $skip_external ) = @_;
158 my $name = Encode::MIME::Name::get_encode_name( $mime_name );
159 return find_encoding( $name, $skip_external );
162 sub resolve_alias($) {
163 my $obj = find_encoding(shift);
164 defined $obj and return $obj->name;
168 sub clone_encoding($) {
169 my $obj = find_encoding(shift);
171 return Storable::dclone($obj);
177 package Encode::UTF_EBCDIC;
178 use parent 'Encode::Encoding';
179 my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
180 Encode::define_encoding($obj, 'Unicode');
182 my ( undef, $str, $chk ) = @_;
184 for ( my $i = 0 ; $i < length($str) ; $i++ ) {
187 utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
194 my ( undef, $str, $chk ) = @_;
196 for ( my $i = 0 ; $i < length($str) ; $i++ ) {
199 utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
206 package Encode::Internal;
207 use parent 'Encode::Encoding';
208 my $obj = bless { Name => "Internal" } => "Encode::Internal";
209 Encode::define_encoding($obj, 'Unicode');
211 my ( undef, $str, $chk ) = @_;
220 # https://rt.cpan.org/Public/Bug/Display.html?id=103253
222 use parent 'Encode::Encoding';
226 package Encode::utf8;
227 use parent 'Encode::Encoding';
229 'utf8' => { Name => 'utf8' },
230 'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
233 bless $obj{$_} => __PACKAGE__;
234 Encode::define_encoding( $obj{$_} => $_ );
237 # ($obj, $dst, $src, $pos, $trm, $chk)
238 # currently ignores $chk
239 my ( undef, undef, undef, $pos, $trm ) = @_;
240 my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
242 if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
244 substr( $$rsrc, $pos, $npos - $pos + length($trm) );
245 $$rpos = $npos + length($trm);
248 $$rdst .= substr( $$rsrc, $pos );
249 $$rpos = length($$rsrc);
260 Encode - character encodings in Perl
264 use Encode qw(decode encode);
265 $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
266 $octets = encode('UTF-8', $characters, Encode::FB_CROAK);
268 =head2 Table of Contents
270 Encode consists of a collection of modules whose details are too extensive
271 to fit in one document. This one itself explains the top-level APIs
272 and general topics at a glance. For other topics and more details,
273 see the documentation for these modules:
277 =item L<Encode::Alias> - Alias definitions to encodings
279 =item L<Encode::Encoding> - Encode Implementation Base Class
281 =item L<Encode::Supported> - List of Supported Encodings
283 =item L<Encode::CN> - Simplified Chinese Encodings
285 =item L<Encode::JP> - Japanese Encodings
287 =item L<Encode::KR> - Korean Encodings
289 =item L<Encode::TW> - Traditional Chinese Encodings
295 The C<Encode> module provides the interface between Perl strings
296 and the rest of the system. Perl strings are sequences of
299 The repertoire of characters that Perl can represent is a superset of those
300 defined by the Unicode Consortium. On most platforms the ordinal
301 values of a character as returned by C<ord(I<S>)> is the I<Unicode
302 codepoint> for that character. The exceptions are platforms where
303 the legacy encoding is some variant of EBCDIC rather than a superset
304 of ASCII; see L<perlebcdic>.
306 During recent history, data is moved around a computer in 8-bit chunks,
307 often called "bytes" but also known as "octets" in standards documents.
308 Perl is widely used to manipulate data of many types: not only strings of
309 characters representing human or computer languages, but also "binary"
310 data, being the machine's representation of numbers, pixels in an image, or
313 When Perl is processing "binary data", the programmer wants Perl to
314 process "sequences of bytes". This is not a problem for Perl: because a
315 byte has 256 possible values, it easily fits in Perl's much larger
318 This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
325 A character in the range 0 .. 2**32-1 (or more);
326 what Perl's strings are made of.
330 A character in the range 0..255;
331 a special case of a Perl character.
335 8 bits of data, with ordinal values 0..255;
336 term for bytes passed to or from a non-Perl context, such as a disk file,
337 standard I/O stream, database, command-line argument, environment variable,
340 =head1 THE PERL ENCODING API
346 $octets = encode(ENCODING, STRING[, CHECK])
348 Encodes the scalar value I<STRING> from Perl's internal form into
349 I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a
350 canonical name or an alias. For encoding names and aliases, see
351 L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.
353 B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
354 on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
357 For example, to convert a string from Perl's internal format into
358 ISO-8859-1, also known as Latin1:
360 $octets = encode("iso-8859-1", $string);
362 B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
363 $octets I<might not be equal to> $string. Though both contain the
364 same data, the UTF8 flag for $octets is I<always> off. When you
365 encode anything, the UTF8 flag on the result is always off, even when it
366 contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
368 If the $string is C<undef>, then C<undef> is returned.
370 C<str2bytes> may be used as an alias for C<encode>.
374 $string = decode(ENCODING, OCTETS[, CHECK])
376 This function returns the string that results from decoding the scalar
377 value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
378 Perl's internal form. As with encode(),
379 I<ENCODING> can be either a canonical name or an alias. For encoding names
380 and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
383 B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
384 on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
387 For example, to convert ISO-8859-1 data into a string in Perl's
390 $string = decode("iso-8859-1", $octets);
392 B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
393 I<might not be equal to> $octets. Though both contain the same data, the
394 UTF8 flag for $string is on. See L</"The UTF8 flag">
397 If $octets is C<undef>, then C<undef> is returned.
399 C<bytes2str> may be used as an alias for C<decode>.
403 [$obj =] find_encoding(ENCODING)
405 Returns the I<encoding object> corresponding to I<ENCODING>. Returns
406 C<undef> if no matching I<ENCODING> is found. The returned object is
407 what does the actual encoding or decoding.
409 $string = decode($name, $bytes);
414 $obj = find_encoding($name);
415 croak qq(encoding "$name" not found) unless ref $obj;
416 $obj->decode($bytes);
419 with more error checking.
421 You can therefore save time by reusing this object as follows:
423 my $enc = find_encoding("iso-8859-1");
425 my $string = $enc->decode($_);
426 ... # now do something with $string;
429 Besides L</decode> and L</encode>, other methods are
430 available as well. For instance, C<name()> returns the canonical
431 name of the encoding object.
433 find_encoding("latin1")->name; # iso-8859-1
435 See L<Encode::Encoding> for details.
437 =head3 find_mime_encoding
439 [$obj =] find_mime_encoding(MIME_ENCODING)
441 Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts
442 the same as C<find_encoding()>, but the C<mime_name()> of the returned
443 object must match I<MIME_ENCODING>. So, unlike C<find_encoding()>,
444 canonical names and aliases are not used when searching for the object.
446 find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
447 find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
448 find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
449 find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>
453 [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
455 Converts I<in-place> data between two encodings. The data in $octets
456 must be encoded as octets and I<not> as characters in Perl's internal
457 format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
460 from_to($octets, "iso-8859-1", "cp1250");
462 and to convert it back:
464 from_to($octets, "cp1250", "iso-8859-1");
466 Because the conversion happens in place, the data to be
467 converted cannot be a string constant: it must be a scalar variable.
469 C<from_to()> returns the length of the converted string in octets on success,
470 and C<undef> on error.
472 B<CAVEAT>: The following operations may look the same, but are not:
474 from_to($data, "iso-8859-1", "UTF-8"); #1
475 $data = decode("iso-8859-1", $data); #2
477 Both #1 and #2 make $data consist of a completely valid UTF-8 string,
478 but only #2 turns the UTF8 flag on. #1 is equivalent to:
480 $data = encode("UTF-8", decode("iso-8859-1", $data));
482 See L</"The UTF8 flag"> below.
486 from_to($octets, $from, $to, $check);
490 $octets = encode($to, decode($from, $octets), $check);
492 Yes, it does I<not> respect the $check during decoding. It is
493 deliberately done that way. If you need minute control, use C<decode>
494 followed by C<encode> as follows:
496 $octets = encode($to, decode($from, $octets, $check_from), $check_to);
500 $octets = encode_utf8($string);
502 Equivalent to C<$octets = encode("utf8", $string)>. The characters in
503 $string are encoded in Perl's internal format, and the result is returned
504 as a sequence of octets. Because all possible characters in Perl have a
505 (loose, not strict) utf8 representation, this function cannot fail.
507 B<WARNING>: do not use this function for data exchange as it can produce
508 non-strict utf8 $octets! For strictly valid UTF-8 output use
509 C<$octets = encode("UTF-8", $string)>.
513 $string = decode_utf8($octets [, CHECK]);
515 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
516 The sequence of octets represented by $octets is decoded
517 from (loose, not strict) utf8 into a sequence of logical characters.
518 Because not all sequences of octets are valid utf8, even in its non-strict form,
519 it is quite possible for this function to fail.
520 For CHECK, see L</"Handling Malformed Data">.
522 B<WARNING>: do not use this function for data exchange as it can produce
523 $string with a non-strict utf8 representation! For strictly valid UTF-8
524 $string representation use C<$string = decode("UTF-8", $octets [, CHECK])>.
526 B<CAVEAT>: the input I<$octets> might be modified in-place depending on
527 what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
530 =head2 Listing available encodings
533 @list = Encode->encodings();
535 Returns a list of canonical names of available encodings that have already
536 been loaded. To get a list of all available encodings including those that
537 have not yet been loaded, say:
539 @all_encodings = Encode->encodings(":all");
541 Or you can give the name of a specific module:
543 @with_jp = Encode->encodings("Encode::JP");
545 When "C<::>" is not in the name, "C<Encode::>" is assumed.
547 @ebcdic = Encode->encodings("EBCDIC");
549 To find out in detail which encodings are supported by this package,
550 see L<Encode::Supported>.
552 =head2 Defining Aliases
554 To add a new alias to a given encoding, use:
558 define_alias(NEWNAME => ENCODING);
560 After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
561 I<ENCODING> may be either the name of an encoding or an
564 Before you do that, first make sure the alias is nonexistent using
565 C<resolve_alias()>, which returns the canonical name thereof.
568 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
569 Encode::resolve_alias("iso-8859-12") # false; nonexistent
570 Encode::resolve_alias($name) eq $name # true if $name is canonical
572 C<resolve_alias()> does not need C<use Encode::Alias>; it can be
573 imported via C<use Encode qw(resolve_alias)>.
575 See L<Encode::Alias> for details.
577 =head2 Finding IANA Character Set Registry names
579 The canonical name of a given encoding does not necessarily agree with
580 IANA Character Set Registry, commonly seen as C<< Content-Type:
581 text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name
582 works, but sometimes it does not, most notably with "utf-8-strict".
584 As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
587 my $enc = find_encoding("UTF-8");
588 warn $enc->name; # utf-8-strict
589 warn $enc->mime_name; # UTF-8
591 See also: L<Encode::Encoding>
593 =head1 Encoding via PerlIO
595 If your perl supports C<PerlIO> (which is the default), you can use a
596 C<PerlIO> layer to decode and encode directly via a filehandle. The
597 following two examples are fully identical in functionality:
599 ### Version 1 via PerlIO
600 open(INPUT, "< :encoding(shiftjis)", $infile)
601 || die "Can't open < $infile for reading: $!";
602 open(OUTPUT, "> :encoding(euc-jp)", $outfile)
603 || die "Can't open > $outfile for writing: $!";
604 while (<INPUT>) { # auto decodes $_
605 print OUTPUT; # auto encodes $_
607 close(INPUT) || die "can't close $infile: $!";
608 close(OUTPUT) || die "can't close $outfile: $!";
610 ### Version 2 via from_to()
611 open(INPUT, "< :raw", $infile)
612 || die "Can't open < $infile for reading: $!";
613 open(OUTPUT, "> :raw", $outfile)
614 || die "Can't open > $outfile for writing: $!";
617 from_to($_, "shiftjis", "euc-jp", 1); # switch encoding
618 print OUTPUT; # emit raw (but properly encoded) data
620 close(INPUT) || die "can't close $infile: $!";
621 close(OUTPUT) || die "can't close $outfile: $!";
623 In the first version above, you let the appropriate encoding layer
624 handle the conversion. In the second, you explicitly translate
625 from one encoding to the other.
627 Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
628 to see whether your encoding is supported by C<PerlIO> by invoking the
629 C<perlio_ok> method on it:
631 Encode::perlio_ok("hz"); # false
632 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available
634 use Encode qw(perlio_ok); # imported upon request
637 Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
638 except for C<hz> and C<ISO-2022-kr>. For the gory details, see
639 L<Encode::Encoding> and L<Encode::PerlIO>.
641 =head1 Handling Malformed Data
643 The optional I<CHECK> argument tells C<Encode> what to do when
644 encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
647 As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
650 B<NOTE:> Not all encodings support this feature.
651 Some encodings ignore the I<CHECK> argument. For example,
652 L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
654 =head2 List of I<CHECK> values
658 I<CHECK> = Encode::FB_DEFAULT ( == 0)
660 If I<CHECK> is 0, encoding and decoding replace any malformed character
661 with a I<substitution character>. When you encode, I<SUBCHAR> is used.
662 When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
663 used. If the data is supposed to be UTF-8, an optional lexical warning of
664 warning category C<"utf8"> is given.
668 I<CHECK> = Encode::FB_CROAK ( == 1)
670 If I<CHECK> is 1, methods immediately die with an error
671 message. Therefore, when I<CHECK> is 1, you should trap
672 exceptions with C<eval{}>, unless you really want to let it C<die>.
676 I<CHECK> = Encode::FB_QUIET
678 If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
679 return the portion of the data that has been processed so far when an
680 error occurs. The data argument is overwritten with everything
681 after that point; that is, the unprocessed portion of the data. This is
682 handy when you have to call C<decode> repeatedly in the case where your
683 source data may contain partial multi-byte character sequences,
684 (that is, you are reading with a fixed-width buffer). Here's some sample
685 code to do exactly that:
687 my($buffer, $string) = ("", "");
688 while (read($fh, $buffer, 256, length($buffer))) {
689 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
690 # $buffer now contains the unprocessed partial character
695 I<CHECK> = Encode::FB_WARN
697 This is the same as C<FB_QUIET> above, except that instead of being silent
698 on errors, it issues a warning. This is handy for when you are debugging.
700 B<CAVEAT>: All warnings from Encode module are reported, independently of
701 L<pragma warnings|warnings> settings. If you want to follow settings of
702 lexical warnings configured by L<pragma warnings|warnings> then append
703 also check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
704 since Encode version 2.99.
706 =head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
710 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
712 =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
714 =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
718 For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
719 C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
721 When you decode, C<\xI<HH>> is inserted for a malformed character, where
722 I<HH> is the hex representation of the octet that could not be decoded to
723 utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
724 the Unicode code point (in any number of hex digits) of the character that
725 cannot be found in the character repertoire of the encoding.
727 The HTML/XML character reference modes are about the same. In place of
728 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
729 XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
731 In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
735 These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
736 constants are laid out. You can import the C<FB_I<XXX>> constants via
737 C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
738 constants via C<use Encode qw(:fallback_all)>.
740 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
743 RETURN_ON_ERR 0x0004 X X
753 If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
754 source string to encode() or decode() will be overwritten in place.
755 If you're not interested in this, then bitwise-OR it with the bitmask.
757 =head2 coderef for CHECK
759 As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
760 ordinal value of the unmapped character as an argument and returns
761 octets that represent the fallback character. For instance:
763 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
765 Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
767 Fallback for C<decode> must return decoded string (sequence of characters)
768 and takes a list of ordinal values as its arguments. So for
769 example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
770 a fallback for bytes that are not valid UTF-8, you could write
772 $str = decode 'UTF-8', $octets, sub {
773 my $tmp = join '', map chr, @_;
774 return decode 'ISO-8859-15', $tmp;
777 =head1 Defining Encodings
779 To define a new encoding, use:
781 use Encode qw(define_encoding);
782 define_encoding($object, CANONICAL_NAME [, alias...]);
784 I<CANONICAL_NAME> will be associated with I<$object>. The object
785 should provide the interface described in L<Encode::Encoding>.
786 If more than two arguments are provided, additional
787 arguments are considered aliases for I<$object>.
789 See L<Encode::Encoding> for details.
793 Before the introduction of Unicode support in Perl, the C<eq> operator
794 just compared the strings represented by two scalars. Beginning with
795 Perl 5.8, C<eq> compares two strings with simultaneous consideration of
796 I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
797 I<Programming Perl, 3rd ed.>
803 Old byte-oriented programs should not spontaneously break on the old
804 byte-oriented data they used to work on.
808 Old byte-oriented programs should magically start working on the new
809 character-oriented data when appropriate.
813 Programs should run just as fast in the new character-oriented mode
814 as in the old byte-oriented mode.
818 Perl should remain one language, rather than forking into a
819 byte-oriented Perl and a character-oriented Perl.
823 When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
824 born yet; many features documented in the book remained unimplemented for a
825 long time. Perl 5.8 corrected much of this, and the introduction of the
826 UTF8 flag is one of them. You can think of there being two fundamentally
827 different kinds of strings and string-operations in Perl: one a
828 byte-oriented mode for when the internal UTF8 flag is off, and the other a
829 character-oriented mode for when the internal UTF8 flag is on.
831 This UTF8 flag is not visible in Perl scripts, exactly for the same reason
832 you cannot (or rather, you I<don't have to>) see whether a scalar contains
833 a string, an integer, or a floating-point number. But you can still peek
834 and poke these if you will. See the next section.
836 =head2 Messing with Perl's Internals
838 The following API uses parts of Perl's internals in the current
839 implementation. As such, they are efficient but may change in a future
844 is_utf8(STRING [, CHECK])
846 [INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
847 If I<CHECK> is true, also checks whether I<STRING> contains well-formed
848 UTF-8. Returns true if successful, false otherwise.
850 Typically only necessary for debugging and testing. Don't use this flag as
851 a marker to distinguish character and binary data; that should be decided
852 for each variable when you write your code.
854 B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
855 I<STRING> is UTF-8 encoded and vice-versa.
857 As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
863 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING>
864 is I<not> checked for containing only well-formed UTF-8. Do not use this
865 unless you I<know with absolute certainty> that the STRING holds only
866 well-formed UTF-8. Returns the previous state of the UTF8 flag (so please
867 don't treat the return value as indicating success or failure), or C<undef>
868 if I<STRING> is not a string.
870 B<NOTE>: For security reasons, this function does not work on tainted values.
876 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use
877 frivolously. Returns the previous state of the UTF8 flag, or C<undef> if
878 I<STRING> is not a string. Do not treat the return value as indicative of
879 success or failure, because that isn't what it means: it is only the
882 B<NOTE>: For security reasons, this function does not work on tainted values.
884 =head1 UTF-8 vs. utf8 vs. UTF8
886 ....We now view strings not as sequences of bytes, but as sequences
887 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
888 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
890 That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
891 first conceived by Ken Thompson when he invented it. However, thanks to
892 later revisions to the applicable standards, official UTF-8 is now rather
893 stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
894 to cover only 21 bits instead of 32 or 64 bits) and some sequences
895 are not allowed, like those used in surrogate pairs, the 32 non-character
896 code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
897 (0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
899 The former default in which Perl would always use a loose interpretation of
900 UTF-8 has now been overruled:
902 From: Larry Wall <larry@wall.org>
903 Date: December 04, 2004 11:51:58 JST
904 To: perl-unicode@perl.org
905 Subject: Re: Make Encode.pm support the real UTF-8
906 Message-Id: <20041204025158.GA28754@wall.org>
908 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
909 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
910 : but "UTF-8" is the name of the standard and should give the
911 : corresponding behaviour.
913 For what it's worth, that's how I've always kept them straight in my
916 Also for what it's worth, Perl 6 will mostly default to strict but
917 make it easy to switch back to lax.
921 Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
922 sense, which is conservative and strict and security-conscious, whereas
923 B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
924 lax. C<Encode> version 2.10 or later thus groks this subtle but critically
925 important distinction between C<"UTF-8"> and C<"utf8">.
927 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
928 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
930 In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical name
931 C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
932 critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
934 find_encoding("UTF-8")->name # is 'utf-8-strict'
935 find_encoding("utf-8")->name # ditto. names are case insensitive
936 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
937 find_encoding("UTF8")->name # is 'utf8'.
939 Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
940 whether a string is internally encoded as "utf8", also without a hyphen.
945 L<Encode::Supported>,
950 L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>
952 the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
956 This project was originated by the late Nick Ing-Simmons and later
957 maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
958 for a full list of people involved. For any questions, send mail to
959 I<< <perl-unicode@perl.org> >> so that we can all share.
961 While Dan Kogai retains the copyright as a maintainer, credit
962 should go to all those involved. See AUTHORS for a list of those
963 who submitted code to the project.
967 Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
969 This library is free software; you can redistribute it and/or modify
970 it under the same terms as Perl itself.