2 # $Id: Encode.pm,v 3.18 2022/06/25 02:04:06 dankogai Exp $
7 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
10 $VERSION = sprintf "%d.%02d", q$Revision: 3.18 $ =~ /(\d+)/g;
12 XSLoader::load( __PACKAGE__, $VERSION );
15 use Exporter 5.57 'import';
18 our @CARP_NOT = qw(Encode::Encoder);
20 # Public, encouraged API is exported by default
23 decode decode_utf8 encode encode_utf8 str2bytes bytes2str
24 encodings find_encoding find_mime_encoding clone_encoding
27 DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
28 PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
31 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
32 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
36 _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
37 is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
39 @FB_FLAGS, @FB_CONSTS,
43 all => [ @EXPORT, @EXPORT_OK ],
44 default => [ @EXPORT ],
45 fallbacks => [ @FB_CONSTS ],
46 fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
49 # Documentation moved after __END__ for speed - NI-S
51 our $ON_EBCDIC = ( ord("A") == 193 );
54 use Encode::MIME::Name;
58 # Make a %Encoding package variable to allow a certain amount of cheating
61 require Encode::Config;
63 # https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
64 # to find why sig handlers inside eval{} are disabled.
69 pop @INC if @INC && $INC[-1] eq '.';
70 require Encode::ConfigLocal;
75 my $arg = $_[1] || '';
76 if ( $arg eq ":all" ) {
77 %enc = ( %Encoding, %ExtModule );
81 for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {
83 for my $enc ( keys %ExtModule ) {
84 $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;
88 return sort { lc $a cmp lc $b }
89 grep { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;
93 my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );
94 $obj->can("perlio_ok") and return $obj->perlio_ok();
95 return 0; # safety net
101 $Encoding{$name} = $obj;
103 define_alias( $lc => $obj ) unless $lc eq $name;
106 define_alias( $alias, $obj );
108 my $class = ref($obj);
109 push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT;
110 push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
115 my ( $class, $name, $skip_external ) = @_;
117 defined($name) or return;
119 $name =~ s/\s+//g; # https://rt.cpan.org/Ticket/Display.html?id=65796
121 ref($name) && $name->can('renew') and return $name;
122 exists $Encoding{$name} and return $Encoding{$name};
124 exists $Encoding{$lc} and return $Encoding{$lc};
126 my $oc = $class->find_alias($name);
127 defined($oc) and return $oc;
128 $lc ne $name and $oc = $class->find_alias($lc);
129 defined($oc) and return $oc;
131 unless ($skip_external) {
132 if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
135 eval { require $mod; };
136 exists $Encoding{$name} and return $Encoding{$name};
142 # HACK: These two functions must be defined in Encode and because of
143 # cyclic dependency between Encode and Encode::Alias, Exporter does not work
145 goto &Encode::Alias::find_alias;
148 goto &Encode::Alias::define_alias;
151 sub find_encoding($;$) {
152 my ( $name, $skip_external ) = @_;
153 return __PACKAGE__->getEncoding( $name, $skip_external );
156 sub find_mime_encoding($;$) {
157 my ( $mime_name, $skip_external ) = @_;
158 my $name = Encode::MIME::Name::get_encode_name( $mime_name );
159 return find_encoding( $name, $skip_external );
162 sub resolve_alias($) {
163 my $obj = find_encoding(shift);
164 defined $obj and return $obj->name;
168 sub clone_encoding($) {
169 my $obj = find_encoding(shift);
171 return Storable::dclone($obj);
177 package Encode::UTF_EBCDIC;
178 use parent 'Encode::Encoding';
179 my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
180 Encode::define_encoding($obj, 'Unicode');
182 my ( undef, $str, $chk ) = @_;
184 for ( my $i = 0 ; $i < length($str) ; $i++ ) {
187 utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
194 my ( undef, $str, $chk ) = @_;
196 for ( my $i = 0 ; $i < length($str) ; $i++ ) {
199 utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
208 # https://rt.cpan.org/Public/Bug/Display.html?id=103253
210 use parent 'Encode::Encoding';
214 package Encode::utf8;
215 use parent 'Encode::Encoding';
217 'utf8' => { Name => 'utf8' },
218 'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
221 bless $obj{$_} => __PACKAGE__;
222 Encode::define_encoding( $obj{$_} => $_ );
225 # ($obj, $dst, $src, $pos, $trm, $chk)
226 # currently ignores $chk
227 my ( undef, undef, undef, $pos, $trm ) = @_;
228 my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];
230 if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
232 substr( $$rsrc, $pos, $npos - $pos + length($trm) );
233 $$rpos = $npos + length($trm);
236 $$rdst .= substr( $$rsrc, $pos );
237 $$rpos = length($$rsrc);
248 Encode - character encodings in Perl
252 use Encode qw(decode encode);
253 $characters = decode('UTF-8', $octets, Encode::FB_CROAK);
254 $octets = encode('UTF-8', $characters, Encode::FB_CROAK);
256 =head2 Table of Contents
258 Encode consists of a collection of modules whose details are too extensive
259 to fit in one document. This one itself explains the top-level APIs
260 and general topics at a glance. For other topics and more details,
261 see the documentation for these modules:
265 =item L<Encode::Alias> - Alias definitions to encodings
267 =item L<Encode::Encoding> - Encode Implementation Base Class
269 =item L<Encode::Supported> - List of Supported Encodings
271 =item L<Encode::CN> - Simplified Chinese Encodings
273 =item L<Encode::JP> - Japanese Encodings
275 =item L<Encode::KR> - Korean Encodings
277 =item L<Encode::TW> - Traditional Chinese Encodings
283 The C<Encode> module provides the interface between Perl strings
284 and the rest of the system. Perl strings are sequences of
287 The repertoire of characters that Perl can represent is a superset of those
288 defined by the Unicode Consortium. On most platforms the ordinal
289 values of a character as returned by C<ord(I<S>)> is the I<Unicode
290 codepoint> for that character. The exceptions are platforms where
291 the legacy encoding is some variant of EBCDIC rather than a superset
292 of ASCII; see L<perlebcdic>.
294 During recent history, data is moved around a computer in 8-bit chunks,
295 often called "bytes" but also known as "octets" in standards documents.
296 Perl is widely used to manipulate data of many types: not only strings of
297 characters representing human or computer languages, but also "binary"
298 data, being the machine's representation of numbers, pixels in an image, or
301 When Perl is processing "binary data", the programmer wants Perl to
302 process "sequences of bytes". This is not a problem for Perl: because a
303 byte has 256 possible values, it easily fits in Perl's much larger
306 This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
313 A character in the range 0 .. 2**32-1 (or more);
314 what Perl's strings are made of.
318 A character in the range 0..255;
319 a special case of a Perl character.
323 8 bits of data, with ordinal values 0..255;
324 term for bytes passed to or from a non-Perl context, such as a disk file,
325 standard I/O stream, database, command-line argument, environment variable,
328 =head1 THE PERL ENCODING API
334 $octets = encode(ENCODING, STRING[, CHECK])
336 Encodes the scalar value I<STRING> from Perl's internal form into
337 I<ENCODING> and returns a sequence of octets. I<ENCODING> can be either a
338 canonical name or an alias. For encoding names and aliases, see
339 L</"Defining Aliases">. For CHECK, see L</"Handling Malformed Data">.
341 B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
342 on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
345 For example, to convert a string from Perl's internal format into
346 ISO-8859-1, also known as Latin1:
348 $octets = encode("iso-8859-1", $string);
350 B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
351 $octets I<might not be equal to> $string. Though both contain the
352 same data, the UTF8 flag for $octets is I<always> off. When you
353 encode anything, the UTF8 flag on the result is always off, even when it
354 contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
356 If the $string is C<undef>, then C<undef> is returned.
358 C<str2bytes> may be used as an alias for C<encode>.
362 $string = decode(ENCODING, OCTETS[, CHECK])
364 This function returns the string that results from decoding the scalar
365 value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
366 Perl's internal form. As with encode(),
367 I<ENCODING> can be either a canonical name or an alias. For encoding names
368 and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
371 B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
372 on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
375 For example, to convert ISO-8859-1 data into a string in Perl's
378 $string = decode("iso-8859-1", $octets);
380 B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
381 I<might not be equal to> $octets. Though both contain the same data, the
382 UTF8 flag for $string is on. See L</"The UTF8 flag">
385 If $octets is C<undef>, then C<undef> is returned.
387 C<bytes2str> may be used as an alias for C<decode>.
391 [$obj =] find_encoding(ENCODING)
393 Returns the I<encoding object> corresponding to I<ENCODING>. Returns
394 C<undef> if no matching I<ENCODING> is found. The returned object is
395 what does the actual encoding or decoding.
397 $string = decode($name, $bytes);
402 $obj = find_encoding($name);
403 croak qq(encoding "$name" not found) unless ref $obj;
404 $obj->decode($bytes);
407 with more error checking.
409 You can therefore save time by reusing this object as follows:
411 my $enc = find_encoding("iso-8859-1");
413 my $string = $enc->decode($_);
414 ... # now do something with $string;
417 Besides L</decode> and L</encode>, other methods are
418 available as well. For instance, C<name()> returns the canonical
419 name of the encoding object.
421 find_encoding("latin1")->name; # iso-8859-1
423 See L<Encode::Encoding> for details.
425 =head3 find_mime_encoding
427 [$obj =] find_mime_encoding(MIME_ENCODING)
429 Returns the I<encoding object> corresponding to I<MIME_ENCODING>. Acts
430 the same as C<find_encoding()>, but the C<mime_name()> of the returned object
431 must match I<MIME_ENCODING>. So, unlike with C<find_encoding()>,
432 canonical names and aliases are not used when searching for the object.
434 find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
435 find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
436 find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
437 find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>
441 [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
443 Converts I<in-place> data between two encodings. The data in $octets
444 must be encoded as octets and I<not> as characters in Perl's internal
445 format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
448 from_to($octets, "iso-8859-1", "cp1250");
450 and to convert it back:
452 from_to($octets, "cp1250", "iso-8859-1");
454 Because the conversion happens in place, the data to be
455 converted cannot be a string constant: it must be a scalar variable.
457 C<from_to()> returns the length of the converted string in octets on success,
458 and C<undef> on error.
460 B<CAVEAT>: The following operations may look the same, but are not:
462 from_to($data, "iso-8859-1", "UTF-8"); #1
463 $data = decode("iso-8859-1", $data); #2
465 Both #1 and #2 make $data consist of a completely valid UTF-8 string,
466 but only #2 turns the UTF8 flag on. #1 is equivalent to:
468 $data = encode("UTF-8", decode("iso-8859-1", $data));
470 See L</"The UTF8 flag"> below.
474 from_to($octets, $from, $to, $check);
478 $octets = encode($to, decode($from, $octets), $check);
480 Yes, it does I<not> respect the $check during decoding. It is
481 deliberately done that way. If you need minute control, use C<decode>
482 followed by C<encode> as follows:
484 $octets = encode($to, decode($from, $octets, $check_from), $check_to);
488 $octets = encode_utf8($string);
490 B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
491 Do not use it for data exchange.
492 Unless you want Perl's older "lax" mode, prefer
493 C<$octets = encode("UTF-8", $string)>.
495 Equivalent to C<$octets = encode("utf8", $string)>. The characters in
496 $string are encoded in Perl's internal format, and the result is returned
497 as a sequence of octets. Because all possible characters in Perl have a
498 (loose, not strict) utf8 representation, this function cannot fail.
502 $string = decode_utf8($octets [, CHECK]);
504 B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
505 Do not use it for data exchange.
506 Unless you want Perl's older "lax" mode, prefer
507 C<$string = decode("UTF-8", $octets [, CHECK])>.
509 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
510 The sequence of octets represented by $octets is decoded
511 from (loose, not strict) utf8 into a sequence of logical characters.
512 Because not all sequences of octets are valid (loose, not strict) utf8,
513 it is quite possible for this function to fail.
514 For CHECK, see L</"Handling Malformed Data">.
516 B<CAVEAT>: the input I<$octets> might be modified in-place depending on
517 what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
520 =head2 Listing available encodings
523 @list = Encode->encodings();
525 Returns a list of canonical names of available encodings that have already
526 been loaded. To get a list of all available encodings including those that
527 have not yet been loaded, say:
529 @all_encodings = Encode->encodings(":all");
531 Or you can give the name of a specific module:
533 @with_jp = Encode->encodings("Encode::JP");
535 When "C<::>" is not in the name, "C<Encode::>" is assumed.
537 @ebcdic = Encode->encodings("EBCDIC");
539 To find out in detail which encodings are supported by this package,
540 see L<Encode::Supported>.
542 =head2 Defining Aliases
544 To add a new alias to a given encoding, use:
548 define_alias(NEWNAME => ENCODING);
550 After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
551 I<ENCODING> may be either the name of an encoding or an
554 Before you do that, first make sure the alias is nonexistent using
555 C<resolve_alias()>, which returns the canonical name thereof.
558 Encode::resolve_alias("latin1") eq "iso-8859-1" # true
559 Encode::resolve_alias("iso-8859-12") # false; nonexistent
560 Encode::resolve_alias($name) eq $name # true if $name is canonical
562 C<resolve_alias()> does not need C<use Encode::Alias>; it can be
563 imported via C<use Encode qw(resolve_alias)>.
565 See L<Encode::Alias> for details.
567 =head2 Finding IANA Character Set Registry names
569 The canonical name of a given encoding does not necessarily agree with
570 IANA Character Set Registry, commonly seen as C<< Content-Type:
571 text/plain; charset=I<WHATEVER> >>. For most cases, the canonical name
572 works, but sometimes it does not, most notably with "utf-8-strict".
574 As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
577 my $enc = find_encoding("UTF-8");
578 warn $enc->name; # utf-8-strict
579 warn $enc->mime_name; # UTF-8
581 See also: L<Encode::Encoding>
583 =head1 Encoding via PerlIO
585 If your perl supports C<PerlIO> (which is the default), you can use a
586 C<PerlIO> layer to decode and encode directly via a filehandle. The
587 following two examples are fully identical in functionality:
589 ### Version 1 via PerlIO
590 open(INPUT, "< :encoding(shiftjis)", $infile)
591 || die "Can't open < $infile for reading: $!";
592 open(OUTPUT, "> :encoding(euc-jp)", $outfile)
593 || die "Can't open > $outfile for writing: $!";
594 while (<INPUT>) { # auto decodes $_
595 print OUTPUT; # auto encodes $_
597 close(INPUT) || die "can't close $infile: $!";
598 close(OUTPUT) || die "can't close $outfile: $!";
600 ### Version 2 via from_to()
601 open(INPUT, "< :raw", $infile)
602 || die "Can't open < $infile for reading: $!";
603 open(OUTPUT, "> :raw", $outfile)
604 || die "Can't open > $outfile for writing: $!";
607 from_to($_, "shiftjis", "euc-jp", 1); # switch encoding
608 print OUTPUT; # emit raw (but properly encoded) data
610 close(INPUT) || die "can't close $infile: $!";
611 close(OUTPUT) || die "can't close $outfile: $!";
613 In the first version above, you let the appropriate encoding layer
614 handle the conversion. In the second, you explicitly translate
615 from one encoding to the other.
617 Unfortunately, it may be that encodings are not C<PerlIO>-savvy. You can check
618 to see whether your encoding is supported by C<PerlIO> by invoking the
619 C<perlio_ok> method on it:
621 Encode::perlio_ok("hz"); # false
622 find_encoding("euc-cn")->perlio_ok; # true wherever PerlIO is available
624 use Encode qw(perlio_ok); # imported upon request
627 Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
628 except for C<hz> and C<ISO-2022-kr>. For the gory details, see
629 L<Encode::Encoding> and L<Encode::PerlIO>.
631 =head1 Handling Malformed Data
633 The optional I<CHECK> argument tells C<Encode> what to do when
634 encountering malformed data. Without I<CHECK>, C<Encode::FB_DEFAULT>
637 As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
640 B<NOTE:> Not all encodings support this feature.
641 Some encodings ignore the I<CHECK> argument. For example,
642 L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
644 =head2 List of I<CHECK> values
648 I<CHECK> = Encode::FB_DEFAULT ( == 0)
650 If I<CHECK> is 0, encoding and decoding replace any malformed character
651 with a I<substitution character>. When you encode, I<SUBCHAR> is used.
652 When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
653 used. If the data is supposed to be UTF-8, an optional lexical warning of
654 warning category C<"utf8"> is given.
658 I<CHECK> = Encode::FB_CROAK ( == 1)
660 If I<CHECK> is 1, methods immediately die with an error
661 message. Therefore, when I<CHECK> is 1, you should trap
662 exceptions with C<eval{}>, unless you really want to let it C<die>.
666 I<CHECK> = Encode::FB_QUIET
668 If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
669 return the portion of the data that has been processed so far when an
670 error occurs. The data argument is overwritten with everything
671 after that point; that is, the unprocessed portion of the data. This is
672 handy when you have to call C<decode> repeatedly in the case where your
673 source data may contain partial multi-byte character sequences,
674 (that is, you are reading with a fixed-width buffer). Here's some sample
675 code to do exactly that:
677 my($buffer, $string) = ("", "");
678 while (read($fh, $buffer, 256, length($buffer))) {
679 $string .= decode($encoding, $buffer, Encode::FB_QUIET);
680 # $buffer now contains the unprocessed partial character
685 I<CHECK> = Encode::FB_WARN
687 This is the same as C<FB_QUIET> above, except that instead of being silent
688 on errors, it issues a warning. This is handy for when you are debugging.
690 B<CAVEAT>: All warnings from Encode module are reported, independently of
691 L<pragma warnings|warnings> settings. If you want to follow settings of
692 lexical warnings configured by L<pragma warnings|warnings> then append
693 also check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
694 since Encode version 2.99.
696 =head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
700 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
702 =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
704 =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
708 For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
709 C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
711 When you decode, C<\xI<HH>> is inserted for a malformed character, where
712 I<HH> is the hex representation of the octet that could not be decoded to
713 utf8. When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
714 the Unicode code point (in any number of hex digits) of the character that
715 cannot be found in the character repertoire of the encoding.
717 The HTML/XML character reference modes are about the same. In place of
718 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
719 XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
721 In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
725 These modes are all actually set via a bitmask. Here is how the C<FB_I<XXX>>
726 constants are laid out. You can import the C<FB_I<XXX>> constants via
727 C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
728 constants via C<use Encode qw(:fallback_all)>.
730 FB_DEFAULT FB_CROAK FB_QUIET FB_WARN FB_PERLQQ
733 RETURN_ON_ERR 0x0004 X X
743 If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
744 source string to encode() or decode() will be overwritten in place.
745 If you do not want this, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
747 =head2 coderef for CHECK
749 As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
750 ordinal value of the unmapped character as an argument and returns
751 octets that represent the fallback character. For instance:
753 $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
755 Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
757 The fallback for C<decode> must return a decoded string (a sequence of characters)
758 and takes a list of ordinal values as its arguments. So for
759 example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
760 a fallback for bytes that are not valid UTF-8, you could write
762 $str = decode 'UTF-8', $octets, sub {
763 my $tmp = join '', map chr, @_;
764 return decode 'ISO-8859-15', $tmp;
767 =head1 Defining Encodings
769 To define a new encoding, use:
771 use Encode qw(define_encoding);
772 define_encoding($object, CANONICAL_NAME [, alias...]);
774 I<CANONICAL_NAME> will be associated with I<$object>. The object
775 should provide the interface described in L<Encode::Encoding>.
776 If more than two arguments are provided, additional
777 arguments are considered aliases for I<$object>.
779 See L<Encode::Encoding> for details.
783 Before the introduction of Unicode support in Perl, the C<eq> operator
784 just compared the strings represented by two scalars. Beginning with
785 Perl 5.8, C<eq> compares two strings with simultaneous consideration of
786 I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
787 I<Programming Perl, 3rd ed.>
793 Old byte-oriented programs should not spontaneously break on the old
794 byte-oriented data they used to work on.
798 Old byte-oriented programs should magically start working on the new
799 character-oriented data when appropriate.
803 Programs should run just as fast in the new character-oriented mode
804 as in the old byte-oriented mode.
808 Perl should remain one language, rather than forking into a
809 byte-oriented Perl and a character-oriented Perl.
813 When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
814 born yet; many features documented in the book remained unimplemented for a
815 long time. Perl 5.8 corrected much of this, and the introduction of the
816 UTF8 flag is one of them. You can think of there being two fundamentally
817 different kinds of strings and string-operations in Perl: one a
818 byte-oriented mode for when the internal UTF8 flag is off, and the other a
819 character-oriented mode for when the internal UTF8 flag is on.
821 This UTF8 flag is not visible in Perl scripts, exactly for the same reason
822 you cannot (or rather, you I<don't have to>) see whether a scalar contains
823 a string, an integer, or a floating-point number. But you can still peek
824 and poke these if you will. See the next section.
826 =head2 Messing with Perl's Internals
828 The following API uses parts of Perl's internals in the current
829 implementation. As such, they are efficient but may change in a future
834 is_utf8(STRING [, CHECK])
836 [INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
837 If I<CHECK> is true, also checks whether I<STRING> contains well-formed
838 UTF-8. Returns true if successful, false otherwise.
840 Typically only necessary for debugging and testing. Don't use this flag as
841 a marker to distinguish character and binary data; that should be decided
842 for each variable when you write your code.
844 B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
845 I<STRING> is UTF-8 encoded and vice-versa.
847 As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
853 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>. The I<STRING>
854 is I<not> checked for containing only well-formed UTF-8. Do not use this
855 unless you I<know with absolute certainty> that the STRING holds only
856 well-formed UTF-8. Returns the previous state of the UTF8 flag (so please
857 don't treat the return value as indicating success or failure), or C<undef>
858 if I<STRING> is not a string.
860 B<NOTE>: For security reasons, this function does not work on tainted values.
866 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>. Do not use
867 frivolously. Returns the previous state of the UTF8 flag, or C<undef> if
868 I<STRING> is not a string. Do not treat the return value as indicative of
869 success or failure, because that isn't what it means: it is only the
872 B<NOTE>: For security reasons, this function does not work on tainted values.
874 =head1 UTF-8 vs. utf8 vs. UTF8
876 ....We now view strings not as sequences of bytes, but as sequences
877 of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
878 computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
880 That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
881 first conceived by Ken Thompson when he invented it. However, thanks to
882 later revisions to the applicable standards, official UTF-8 is now rather
883 stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
884 to cover only 21 bits instead of 32 or 64 bits) and some sequences
885 are not allowed, like those used in surrogate pairs, the 32 non-character
886 code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
887 (0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
889 The former default in which Perl would always use a loose interpretation of
890 UTF-8 has now been overruled:
892 From: Larry Wall <larry@wall.org>
893 Date: December 04, 2004 11:51:58 JST
894 To: perl-unicode@perl.org
895 Subject: Re: Make Encode.pm support the real UTF-8
896 Message-Id: <20041204025158.GA28754@wall.org>
898 On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
899 : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
900 : but "UTF-8" is the name of the standard and should give the
901 : corresponding behaviour.
903 For what it's worth, that's how I've always kept them straight in my
906 Also for what it's worth, Perl 6 will mostly default to strict but
907 make it easy to switch back to lax.
911 Got that? As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
912 sense, which is conservative and strict and security-conscious, whereas
913 B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
914 lax. C<Encode> version 2.10 or later thus groks this subtle but critically
915 important distinction between C<"UTF-8"> and C<"utf8">.
917 encode("utf8", "\x{FFFF_FFFF}", 1); # okay
918 encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
920 This distinction is also important for decoding. In the following,
921 C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
922 C<$s> thus stores an invalid Unicode code point:
924 $s = decode("utf8", "\xf8\x88\x80\x80\x80");
926 C<"UTF-8">, by contrast, will either coerce the input to something valid:
928 $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
932 decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
934 In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical name
935 C<"utf-8-strict">. That hyphen between the C<"UTF"> and the C<"8"> is
936 critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
938 find_encoding("UTF-8")->name # is 'utf-8-strict'
939 find_encoding("utf-8")->name # ditto. names are case insensitive
940 find_encoding("utf_8")->name # ditto. "_" are treated as "-"
941 find_encoding("UTF8")->name # is 'utf8'.
943 Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
944 whether a string is internally encoded as "utf8", also without a hyphen.
949 L<Encode::Supported>,
954 L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>
956 the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
960 This project was originated by the late Nick Ing-Simmons and later
961 maintained by Dan Kogai I<< <dankogai@cpan.org> >>. See AUTHORS
962 for a full list of people involved. For any questions, send mail to
963 I<< <perl-unicode@perl.org> >> so that we can all share.
965 While Dan Kogai retains the copyright as a maintainer, credit
966 should go to all those involved. See AUTHORS for a list of those
967 who submitted code to the project.
971 Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
973 This library is free software; you can redistribute it and/or modify
974 it under the same terms as Perl itself.