cpan/Encode/Encode.pm

   1 #
   2 # $Id: Encode.pm,v 3.18 2022/06/25 02:04:06 dankogai Exp $
   3 #
   4 package Encode;
   5 use strict;
   6 use warnings;
   7 use constant DEBUG => !!$ENV{PERL_ENCODE_DEBUG};
   8 our $VERSION;
   9 BEGIN {
  10     $VERSION = sprintf "%d.%02d", q$Revision: 3.18 $ =~ /(\d+)/g;
  11     require XSLoader;
  12     XSLoader::load( __PACKAGE__, $VERSION );
  13 }
  14
  15 use Exporter 5.57 'import';
  16
  17 use Carp ();
  18 our @CARP_NOT = qw(Encode::Encoder);
  19
  20 # Public, encouraged API is exported by default
  21
  22 our @EXPORT = qw(
  23   decode  decode_utf8  encode  encode_utf8 str2bytes bytes2str
  24   encodings  find_encoding find_mime_encoding clone_encoding
  25 );
  26 our @FB_FLAGS = qw(
  27   DIE_ON_ERR WARN_ON_ERR RETURN_ON_ERR LEAVE_SRC
  28   PERLQQ HTMLCREF XMLCREF STOP_AT_PARTIAL
  29 );
  30 our @FB_CONSTS = qw(
  31   FB_DEFAULT FB_CROAK FB_QUIET FB_WARN
  32   FB_PERLQQ FB_HTMLCREF FB_XMLCREF
  33 );
  34 our @EXPORT_OK = (
  35     qw(
  36       _utf8_off _utf8_on define_encoding from_to is_16bit is_8bit
  37       is_utf8 perlio_ok resolve_alias utf8_downgrade utf8_upgrade
  38       ),
  39     @FB_FLAGS, @FB_CONSTS,
  40 );
  41
  42 our %EXPORT_TAGS = (
  43     all          => [ @EXPORT,    @EXPORT_OK ],
  44     default      => [ @EXPORT ],
  45     fallbacks    => [ @FB_CONSTS ],
  46     fallback_all => [ @FB_CONSTS, @FB_FLAGS ],
  47 );
  48
  49 # Documentation moved after __END__ for speed - NI-S
  50
  51 our $ON_EBCDIC = ( ord("A") == 193 );
  52
  53 use Encode::Alias ();
  54 use Encode::MIME::Name;
  55
  56 use Storable;
  57
  58 # Make a %Encoding package variable to allow a certain amount of cheating
  59 our %Encoding;
  60 our %ExtModule;
  61 require Encode::Config;
  62 #  Optionally load Encode::ConfigLocal; a failed require is ignored.  See
  63 #  https://bugzilla.redhat.com/show_bug.cgi?id=435505#c2
  64 #  for why the signal handlers are disabled inside the eval{}.
  65 eval {
  66     local $SIG{__DIE__};     # no caller-installed __DIE__ hook in here
  67     local $SIG{__WARN__};    # likewise for __WARN__
  68     local @INC = @INC;       # edit a localized copy of @INC only
  69     pop @INC if @INC && $INC[-1] eq '.';    # ignore a trailing '.' entry
  70     require Encode::ConfigLocal;
  71 };
  72
  73 sub encodings {    # Encode->encodings([":all" | MODULE...]) => sorted list of names
  74     my %enc;
  75     my $arg  = $_[1] || '';    # $_[0] is the invocant when called as a method
  76     if ( $arg eq ":all" ) {
  77         %enc = ( %Encoding, %ExtModule );    # defined encodings plus those in %ExtModule
  78     }
  79     else {
  80         %enc = %Encoding;    # start from the already defined encodings
  81         for my $mod ( map { m/::/ ? $_ : "Encode::$_" } @_ ) {    # bare names get an "Encode::" prefix
  82             DEBUG and warn $mod;
  83             for my $enc ( keys %ExtModule ) {
  84                 $ExtModule{$enc} eq $mod and $enc{$enc} = $mod;    # add names provided by that module
  85             }
  86         }
  87     }
  88     return sort { lc $a cmp lc $b }    # case-insensitive ordering
  89       grep      { !/^(?:Internal|Unicode|Guess)$/o } keys %enc;    # drop Internal/Unicode/Guess
  90 }
  91
  92 sub perlio_ok {    # perlio_ok(NAME_OR_OBJECT) => the object's perlio_ok() result, else 0
  93     my $obj = ref( $_[0] ) ? $_[0] : find_encoding( $_[0] );    # undef when the name is unknown
  94     defined($obj) && $obj->can("perlio_ok") and return $obj->perlio_ok();
  95     return 0;    # safety net: unknown encoding, or object without perlio_ok()
  96 }
  97
  98 sub define_encoding {    # define_encoding(OBJ, NAME [, ALIAS...]) => OBJ
  99     my $obj  = shift;
 100     my $name = shift;
 101     $Encoding{$name} = $obj;    # register the object under NAME
 102     my $lc = lc($name);
 103     define_alias( $lc => $obj ) unless $lc eq $name;    # lowercase spelling becomes an alias
 104     while (@_) {    # any remaining arguments are extra aliases
 105         my $alias = shift;
 106         define_alias( $alias, $obj );
 107     }
 108     my $class = ref($obj);
 109     push @Encode::CARP_NOT, $class unless grep { $_ eq $class } @Encode::CARP_NOT;    # add the class only once
 110     push @Encode::Encoding::CARP_NOT, $class unless grep { $_ eq $class } @Encode::Encoding::CARP_NOT;
 111     return $obj;
 112 }
 113
 114 sub getEncoding {
 115     my ( $class, $name, $skip_external ) = @_;
 116     # Resolve NAME (or pass through an encoding object); empty return if unknown.
 117     defined($name) or return;
 118
 119     ref($name) && $name->can('renew') and return $name;    # already an encoding object
 120     # Strip whitespace: https://rt.cpan.org/Ticket/Display.html?id=65796
 121     $name =~ s/\s+//g;
 122     exists $Encoding{$name} and return $Encoding{$name};
 123     my $lc = lc $name;
 124     exists $Encoding{$lc} and return $Encoding{$lc};
 125
 126     my $oc = $class->find_alias($name);
 127     defined($oc) and return $oc;
 128     $lc ne $name and $oc = $class->find_alias($lc);
 129     defined($oc) and return $oc;
 130
 131     unless ($skip_external) {
 132         if ( my $mod = $ExtModule{$name} || $ExtModule{$lc} ) {
 133             ( my $file = "$mod.pm" ) =~ s{::}{/}g;    # Foo::Bar => Foo/Bar.pm
 134             eval { require $file; };    # best effort; a failed load falls through
 135             exists $Encoding{$name} and return $Encoding{$name};
 136             exists $Encoding{$lc}   and return $Encoding{$lc};    # module was found via the lowercase key
 137         }
 138     }
 139     return;
 140 }
 141
 142 # HACK: these two functions must be defined here in Encode because the cyclic
 143 # dependency between Encode and Encode::Alias keeps Exporter from working.
 144 sub find_alias {
 145     goto &Encode::Alias::find_alias;      # hand over to Encode::Alias with the current @_
 146 }
 147 sub define_alias {
 148     goto &Encode::Alias::define_alias;    # hand over to Encode::Alias with the current @_
 149 }
 150
 151 sub find_encoding($;$) {    # find_encoding(NAME [, SKIP_EXTERNAL]) => encoding object, or nothing
 152     my ( $enc_name, $no_external ) = @_;
 153     return __PACKAGE__->getEncoding( $enc_name, $no_external );    # all lookup logic lives in getEncoding
 154 }
 155
 156 sub find_mime_encoding($;$) {    # like find_encoding(), but the argument is a MIME name
 157     my ( $mime_name, $skip_external ) = @_;
 158     my $name = Encode::MIME::Name::get_encode_name( $mime_name );    # translate MIME name to Encode's name
 159     return find_encoding( $name, $skip_external );
 160 }
 161
 162 sub resolve_alias($) {    # NAME => name() of the encoding it resolves to, or nothing
 163     my $enc = find_encoding(shift);
 164     return $enc->name if defined $enc;
 165     return;    # NAME did not resolve to an encoding
 166 }
 167
 168 sub clone_encoding($) {    # NAME => deep copy of the encoding object, or nothing
 169     my $enc = find_encoding(shift);
 170     return unless ref $enc;    # nothing to clone when the lookup failed
 171     return Storable::dclone($enc);
 172 }
 173
 174 onBOOT;
 175
 176 if ($ON_EBCDIC) {    # the object below is only created and registered when $ON_EBCDIC is true
 177     package Encode::UTF_EBCDIC;
 178     use parent 'Encode::Encoding';
 179     my $obj = bless { Name => "UTF_EBCDIC" } => "Encode::UTF_EBCDIC";
 180     Encode::define_encoding($obj, 'Unicode');    # registered under the name 'Unicode'
 181     sub decode {    # ($obj, $str, $chk): map each character through utf8::unicode_to_native()
 182         my ( undef, $str, $chk ) = @_;
 183         my $res = '';
 184         for ( my $i = 0 ; $i < length($str) ; $i++ ) {    # one character at a time
 185             $res .=
 186               chr(
 187                 utf8::unicode_to_native( ord( substr( $str, $i, 1 ) ) )
 188               );
 189         }
 190         $_[1] = '' if $chk;    # with CHECK set, empty the caller's source string
 191         return $res;
 192     }
 193     sub encode {    # ($obj, $str, $chk): map each character through utf8::native_to_unicode()
 194         my ( undef, $str, $chk ) = @_;
 195         my $res = '';
 196         for ( my $i = 0 ; $i < length($str) ; $i++ ) {    # one character at a time
 197             $res .=
 198               chr(
 199                 utf8::native_to_unicode( ord( substr( $str, $i, 1 ) ) )
 200               );
 201         }
 202         $_[1] = '' if $chk;    # with CHECK set, empty the caller's source string
 203         return $res;
 204     }
 205 }
 206
 207 {
 208     # See https://rt.cpan.org/Public/Bug/Display.html?id=103253
 209     package Encode::XS;
 210     use parent 'Encode::Encoding';    # Encode::XS inherits from Encode::Encoding
 211 }
 212
 213 {
 214     package Encode::utf8;
 215     use parent 'Encode::Encoding';
 216     my %obj = (
 217         'utf8'         => { Name => 'utf8' },
 218         'utf-8-strict' => { Name => 'utf-8-strict', strict_utf8 => 1 }
 219     );
 220     for ( keys %obj ) {    # bless both objects and register each under its own name
 221         bless $obj{$_} => __PACKAGE__;
 222         Encode::define_encoding( $obj{$_} => $_ );
 223     }
 224     sub cat_decode {
 225         # Args: ($obj, $dst, $src, $pos, $trm, $chk); $chk is currently ignored.
 226         # Appends $src from $pos through the next $trm to $dst and advances $pos.
 227         my ( undef, undef, undef, $pos, $trm ) = @_;
 228         my ( $rdst, $rsrc, $rpos ) = \@_[ 1, 2, 3 ];    # refs, so the caller's variables are updated
 229         use bytes;    # byte semantics for index/substr/length below
 230         if ( ( my $npos = index( $$rsrc, $trm, $pos ) ) >= 0 ) {
 231             $$rdst .=
 232               substr( $$rsrc, $pos, $npos - $pos + length($trm) );
 233             $$rpos = $npos + length($trm);    # position just past the terminator
 234             return 1;    # terminator found
 235         }
 236         $$rdst .= substr( $$rsrc, $pos );    # no terminator: take everything that is left
 237         $$rpos = length($$rsrc);
 238         return '';    # false: terminator not found
 239     }
 240 }
 241
 242 1;
 243
 244 __END__
 245
 246 =head1 NAME
 247
 248 Encode - character encodings in Perl
 249
 250 =head1 SYNOPSIS
 251
 252     use Encode qw(decode encode);
 253     $characters = decode('UTF-8', $octets,     Encode::FB_CROAK);
 254     $octets     = encode('UTF-8', $characters, Encode::FB_CROAK);
 255
 256 =head2 Table of Contents
 257
 258 Encode consists of a collection of modules whose details are too extensive
 259 to fit in one document.  This one itself explains the top-level APIs
 260 and general topics at a glance.  For other topics and more details,
 261 see the documentation for these modules:
 262
 263 =over 2
 264
 265 =item L<Encode::Alias> - Alias definitions to encodings
 266
 267 =item L<Encode::Encoding> - Encode Implementation Base Class
 268
 269 =item L<Encode::Supported> - List of Supported Encodings
 270
 271 =item L<Encode::CN> - Simplified Chinese Encodings
 272
 273 =item L<Encode::JP> - Japanese Encodings
 274
 275 =item L<Encode::KR> - Korean Encodings
 276
 277 =item L<Encode::TW> - Traditional Chinese Encodings
 278
 279 =back
 280
 281 =head1 DESCRIPTION
 282
 283 The C<Encode> module provides the interface between Perl strings
 284 and the rest of the system.  Perl strings are sequences of
 285 I<characters>.
 286
 287 The repertoire of characters that Perl can represent is a superset of those
 288 defined by the Unicode Consortium. On most platforms the ordinal
 289 values of a character as returned by C<ord(I<S>)> is the I<Unicode
 290 codepoint> for that character. The exceptions are platforms where
 291 the legacy encoding is some variant of EBCDIC rather than a superset
 292 of ASCII; see L<perlebcdic>.
 293
 294 During recent history, data is moved around a computer in 8-bit chunks,
 295 often called "bytes" but also known as "octets" in standards documents.
 296 Perl is widely used to manipulate data of many types: not only strings of
 297 characters representing human or computer languages, but also "binary"
 298 data, being the machine's representation of numbers, pixels in an image, or
 299 just about anything.
 300
 301 When Perl is processing "binary data", the programmer wants Perl to
 302 process "sequences of bytes". This is not a problem for Perl: because a
 303 byte has 256 possible values, it easily fits in Perl's much larger
 304 "logical character".
 305
 306 This document mostly explains the I<how>. L<perlunitut> and L<perlunifaq>
 307 explain the I<why>.
 308
 309 =head2 TERMINOLOGY
 310
 311 =head3 character
 312
 313 A character in the range 0 .. 2**32-1 (or more);
 314 what Perl's strings are made of.
 315
 316 =head3 byte
 317
 318 A character in the range 0..255;
 319 a special case of a Perl character.
 320
 321 =head3 octet
 322
 323 8 bits of data, with ordinal values 0..255;
 324 term for bytes passed to or from a non-Perl context, such as a disk file,
 325 standard I/O stream, database, command-line argument, environment variable,
 326 socket etc.
 327
 328 =head1 THE PERL ENCODING API
 329
 330 =head2 Basic methods
 331
 332 =head3 encode
 333
 334   $octets  = encode(ENCODING, STRING[, CHECK])
 335
 336 Encodes the scalar value I<STRING> from Perl's internal form into
 337 I<ENCODING> and returns a sequence of octets.  I<ENCODING> can be either a
 338 canonical name or an alias.  For encoding names and aliases, see
 339 L</"Defining Aliases">.  For CHECK, see L</"Handling Malformed Data">.
 340
 341 B<CAVEAT>: the input scalar I<STRING> might be modified in-place depending
 342 on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
 343 left unchanged.
 344
 345 For example, to convert a string from Perl's internal format into
 346 ISO-8859-1, also known as Latin1:
 347
 348   $octets = encode("iso-8859-1", $string);
 349
 350 B<CAVEAT>: When you run C<$octets = encode("UTF-8", $string)>, then
 351 $octets I<might not be equal to> $string.  Though both contain the
 352 same data, the UTF8 flag for $octets is I<always> off.  When you
 353 encode anything, the UTF8 flag on the result is always off, even when it
 354 contains a completely valid UTF-8 string. See L</"The UTF8 flag"> below.
 355
 356 If the $string is C<undef>, then C<undef> is returned.
 357
 358 C<str2bytes> may be used as an alias for C<encode>.
 359
 360 =head3 decode
 361
 362   $string = decode(ENCODING, OCTETS[, CHECK])
 363
 364 This function returns the string that results from decoding the scalar
 365 value I<OCTETS>, assumed to be a sequence of octets in I<ENCODING>, into
 366 Perl's internal form.  As with encode(),
 367 I<ENCODING> can be either a canonical name or an alias. For encoding names
 368 and aliases, see L</"Defining Aliases">; for I<CHECK>, see L</"Handling
 369 Malformed Data">.
 370
 371 B<CAVEAT>: the input scalar I<OCTETS> might be modified in-place depending
 372 on what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
 373 left unchanged.
 374
 375 For example, to convert ISO-8859-1 data into a string in Perl's
 376 internal format:
 377
 378   $string = decode("iso-8859-1", $octets);
 379
 380 B<CAVEAT>: When you run C<$string = decode("UTF-8", $octets)>, then $string
 381 I<might not be equal to> $octets.  Though both contain the same data, the
 382 UTF8 flag for $string is on.  See L</"The UTF8 flag">
 383 below.
 384
 385 If $octets is C<undef>, then C<undef> is returned.
 386
 387 C<bytes2str> may be used as an alias for C<decode>.
 388
 389 =head3 find_encoding
 390
 391   [$obj =] find_encoding(ENCODING)
 392
 393 Returns the I<encoding object> corresponding to I<ENCODING>.  Returns
 394 C<undef> if no matching I<ENCODING> is found.  The returned object is
 395 what does the actual encoding or decoding.
 396
 397   $string = decode($name, $bytes);
 398
 399 is in fact
 400
 401     $string = do {
 402         $obj = find_encoding($name);
 403         croak qq(encoding "$name" not found) unless ref $obj;
 404         $obj->decode($bytes);
 405     };
 406
 407 with more error checking.
 408
 409 You can therefore save time by reusing this object as follows:
 410
 411     my $enc = find_encoding("iso-8859-1");
 412     while(<>) {
 413         my $string = $enc->decode($_);
 414         ... # now do something with $string;
 415     }
 416
 417 Besides L</decode> and L</encode>, other methods are
 418 available as well.  For instance, C<name()> returns the canonical
 419 name of the encoding object.
 420
 421   find_encoding("latin1")->name; # iso-8859-1
 422
 423 See L<Encode::Encoding> for details.
 424
 425 =head3 find_mime_encoding
 426
 427   [$obj =] find_mime_encoding(MIME_ENCODING)
 428
 429 Returns the I<encoding object> corresponding to I<MIME_ENCODING>.  Acts
 430 the same as C<find_encoding()>, but the C<mime_name()> of the returned
 431 object must match I<MIME_ENCODING>.  So, unlike C<find_encoding()>,
 432 canonical names and aliases are not used when searching for the object.
 433
 434     find_mime_encoding("utf8"); # returns undef because "utf8" is not valid I<MIME_ENCODING>
 435     find_mime_encoding("utf-8"); # returns encode object "utf-8-strict"
 436     find_mime_encoding("UTF-8"); # same as "utf-8" because I<MIME_ENCODING> is case insensitive
 437     find_mime_encoding("utf-8-strict"); # returns undef because "utf-8-strict" is not valid I<MIME_ENCODING>
 438
 439 =head3 from_to
 440
 441   [$length =] from_to($octets, FROM_ENC, TO_ENC [, CHECK])
 442
 443 Converts I<in-place> data between two encodings. The data in $octets
 444 must be encoded as octets and I<not> as characters in Perl's internal
 445 format. For example, to convert ISO-8859-1 data into Microsoft's CP1250
 446 encoding:
 447
 448   from_to($octets, "iso-8859-1", "cp1250");
 449
 450 and to convert it back:
 451
 452   from_to($octets, "cp1250", "iso-8859-1");
 453
 454 Because the conversion happens in place, the data to be
 455 converted cannot be a string constant: it must be a scalar variable.
 456
 457 C<from_to()> returns the length of the converted string in octets on success,
 458 and C<undef> on error.
 459
 460 B<CAVEAT>: The following operations may look the same, but are not:
 461
 462   from_to($data, "iso-8859-1", "UTF-8"); #1
 463   $data = decode("iso-8859-1", $data);  #2
 464
 465 Both #1 and #2 make $data consist of a completely valid UTF-8 string,
 466 but only #2 turns the UTF8 flag on.  #1 is equivalent to:
 467
 468   $data = encode("UTF-8", decode("iso-8859-1", $data));
 469
 470 See L</"The UTF8 flag"> below.
 471
 472 Also note that:
 473
 474   from_to($octets, $from, $to, $check);
 475
 476 is equivalent to:
 477
 478   $octets = encode($to, decode($from, $octets), $check);
 479
 480 Yes, it does I<not> respect the $check during decoding.  It is
 481 deliberately done that way.  If you need minute control, use C<decode>
 482 followed by C<encode> as follows:
 483
 484   $octets = encode($to, decode($from, $octets, $check_from), $check_to);
 485
 486 =head3 encode_utf8
 487
 488   $octets = encode_utf8($string);
 489
 490 B<WARNING>: L<This function can produce invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
 491 Do not use it for data exchange.
 492 Unless you want Perl's older "lax" mode, prefer
 493 C<$octets = encode("UTF-8", $string)>.
 494
 495 Equivalent to C<$octets = encode("utf8", $string)>.  The characters in
 496 $string are encoded in Perl's internal format, and the result is returned
 497 as a sequence of octets.  Because all possible characters in Perl have a
 498 (loose, not strict) utf8 representation, this function cannot fail.
 499
 500 =head3 decode_utf8
 501
 502   $string = decode_utf8($octets [, CHECK]);
 503
 504 B<WARNING>: L<This function accepts invalid UTF-8!|/UTF-8 vs. utf8 vs. UTF8>
 505 Do not use it for data exchange.
 506 Unless you want Perl's older "lax" mode, prefer
 507 C<$string = decode("UTF-8", $octets [, CHECK])>.
 508
 509 Equivalent to C<$string = decode("utf8", $octets [, CHECK])>.
 510 The sequence of octets represented by $octets is decoded
 511 from (loose, not strict) utf8 into a sequence of logical characters.
 512 Because not all sequences of octets are valid utf8, even in its non-strict form,
 513 it is quite possible for this function to fail.
 514 For CHECK, see L</"Handling Malformed Data">.
 515
 516 B<CAVEAT>: the input I<$octets> might be modified in-place depending on
 517 what is set in CHECK. See L</LEAVE_SRC> if you want your inputs to be
 518 left unchanged.
 519
 520 =head2 Listing available encodings
 521
 522   use Encode;
 523   @list = Encode->encodings();
 524
 525 Returns a list of canonical names of available encodings that have already
 526 been loaded.  To get a list of all available encodings including those that
 527 have not yet been loaded, say:
 528
 529   @all_encodings = Encode->encodings(":all");
 530
 531 Or you can give the name of a specific module:
 532
 533   @with_jp = Encode->encodings("Encode::JP");
 534
 535 When "C<::>" is not in the name, "C<Encode::>" is assumed.
 536
 537   @ebcdic = Encode->encodings("EBCDIC");
 538
 539 To find out in detail which encodings are supported by this package,
 540 see L<Encode::Supported>.
 541
 542 =head2 Defining Aliases
 543
 544 To add a new alias to a given encoding, use:
 545
 546   use Encode;
 547   use Encode::Alias;
 548   define_alias(NEWNAME => ENCODING);
 549
 550 After that, I<NEWNAME> can be used as an alias for I<ENCODING>.
 551 I<ENCODING> may be either the name of an encoding or an
 552 I<encoding object>.
 553
 554 Before you do that, first make sure the alias is nonexistent using
 555 C<resolve_alias()>, which returns the canonical name thereof.
 556 For example:
 557
 558   Encode::resolve_alias("latin1") eq "iso-8859-1" # true
 559   Encode::resolve_alias("iso-8859-12")   # false; nonexistent
 560   Encode::resolve_alias($name) eq $name  # true if $name is canonical
 561
 562 C<resolve_alias()> does not need C<use Encode::Alias>; it can be
 563 imported via C<use Encode qw(resolve_alias)>.
 564
 565 See L<Encode::Alias> for details.
 566
 567 =head2 Finding IANA Character Set Registry names
 568
 569 The canonical name of a given encoding does not necessarily agree with
 570 IANA Character Set Registry, commonly seen as C<< Content-Type:
 571 text/plain; charset=I<WHATEVER> >>.  For most cases, the canonical name
 572 works, but sometimes it does not, most notably with "utf-8-strict".
 573
 574 As of C<Encode> version 2.21, a new method C<mime_name()> is therefore added.
 575
 576   use Encode;
 577   my $enc = find_encoding("UTF-8");
 578   warn $enc->name;      # utf-8-strict
 579   warn $enc->mime_name; # UTF-8
 580
 581 See also:  L<Encode::Encoding>
 582
 583 =head1 Encoding via PerlIO
 584
 585 If your perl supports C<PerlIO> (which is the default), you can use a
 586 C<PerlIO> layer to decode and encode directly via a filehandle.  The
 587 following two examples are fully identical in functionality:
 588
 589   ### Version 1 via PerlIO
 590     open(INPUT,  "< :encoding(shiftjis)", $infile)
 591         || die "Can't open < $infile for reading: $!";
 592     open(OUTPUT, "> :encoding(euc-jp)",  $outfile)
 593         || die "Can't open > $outfile for writing: $!";
 594     while (<INPUT>) {   # auto decodes $_
 595         print OUTPUT;   # auto encodes $_
 596     }
 597     close(INPUT)   || die "can't close $infile: $!";
 598     close(OUTPUT)  || die "can't close $outfile: $!";
 599
 600   ### Version 2 via from_to()
 601     open(INPUT,  "< :raw", $infile)
 602         || die "Can't open < $infile for reading: $!";
 603     open(OUTPUT, "> :raw",  $outfile)
 604         || die "Can't open > $outfile for writing: $!";
 605
 606     while (<INPUT>) {
 607         from_to($_, "shiftjis", "euc-jp", 1);  # switch encoding
 608         print OUTPUT;   # emit raw (but properly encoded) data
 609     }
 610     close(INPUT)   || die "can't close $infile: $!";
 611     close(OUTPUT)  || die "can't close $outfile: $!";
 612
 613 In the first version above, you let the appropriate encoding layer
 614 handle the conversion.  In the second, you explicitly translate
 615 from one encoding to the other.
 616
 617 Unfortunately, it may be that encodings are not C<PerlIO>-savvy.  You can check
 618 to see whether your encoding is supported by C<PerlIO> by invoking the
 619 C<perlio_ok> method on it:
 620
 621   Encode::perlio_ok("hz");             # false
 622   find_encoding("euc-cn")->perlio_ok;  # true wherever PerlIO is available
 623
 624   use Encode qw(perlio_ok);            # imported upon request
 625   perlio_ok("euc-jp")
 626
 627 Fortunately, all encodings that come with C<Encode> core are C<PerlIO>-savvy
 628 except for C<hz> and C<ISO-2022-kr>.  For the gory details, see
 629 L<Encode::Encoding> and L<Encode::PerlIO>.
 630
 631 =head1 Handling Malformed Data
 632
 633 The optional I<CHECK> argument tells C<Encode> what to do when
 634 encountering malformed data.  Without I<CHECK>, C<Encode::FB_DEFAULT>
 635 (== 0) is assumed.
 636
 637 As of version 2.12, C<Encode> supports coderef values for C<CHECK>;
 638 see below.
 639
 640 B<NOTE:> Not all encodings support this feature.
 641 Some encodings ignore the I<CHECK> argument.  For example,
 642 L<Encode::Unicode> ignores I<CHECK> and it always croaks on error.
 643
 644 =head2 List of I<CHECK> values
 645
 646 =head3 FB_DEFAULT
 647
 648   I<CHECK> = Encode::FB_DEFAULT ( == 0)
 649
 650 If I<CHECK> is 0, encoding and decoding replace any malformed character
 651 with a I<substitution character>.  When you encode, I<SUBCHAR> is used.
 652 When you decode, the Unicode REPLACEMENT CHARACTER, code point U+FFFD, is
 653 used.  If the data is supposed to be UTF-8, an optional lexical warning of
 654 warning category C<"utf8"> is given.
 655
 656 =head3 FB_CROAK
 657
 658   I<CHECK> = Encode::FB_CROAK ( == 1)
 659
 660 If I<CHECK> is 1, methods immediately die with an error
 661 message.  Therefore, when I<CHECK> is 1, you should trap
 662 exceptions with C<eval{}>, unless you really want to let it C<die>.
 663
 664 =head3 FB_QUIET
 665
 666   I<CHECK> = Encode::FB_QUIET
 667
 668 If I<CHECK> is set to C<Encode::FB_QUIET>, encoding and decoding immediately
 669 return the portion of the data that has been processed so far when an
 670 error occurs. The data argument is overwritten with everything
 671 after that point; that is, the unprocessed portion of the data.  This is
 672 handy when you have to call C<decode> repeatedly in the case where your
 673 source data may contain partial multi-byte character sequences,
 674 (that is, you are reading with a fixed-width buffer). Here's some sample
 675 code to do exactly that:
 676
 677     my($buffer, $string) = ("", "");
 678     while (read($fh, $buffer, 256, length($buffer))) {
 679         $string .= decode($encoding, $buffer, Encode::FB_QUIET);
 680         # $buffer now contains the unprocessed partial character
 681     }
 682
 683 =head3 FB_WARN
 684
 685   I<CHECK> = Encode::FB_WARN
 686
 687 This is the same as C<FB_QUIET> above, except that instead of being silent
 688 on errors, it issues a warning.  This is handy for when you are debugging.
 689
 690 B<CAVEAT>: All warnings from Encode module are reported, independently of
 691 L<pragma warnings|warnings> settings. If you want to follow settings of
 692 lexical warnings configured by L<pragma warnings|warnings> then append
 693 also check value C<Encode::ONLY_PRAGMA_WARNINGS>. This value is available
 694 since Encode version 2.99.
 695
 696 =head3 FB_PERLQQ FB_HTMLCREF FB_XMLCREF
 697
 698 =over 2
 699
 700 =item perlqq mode (I<CHECK> = Encode::FB_PERLQQ)
 701
 702 =item HTML charref mode (I<CHECK> = Encode::FB_HTMLCREF)
 703
 704 =item XML charref mode (I<CHECK> = Encode::FB_XMLCREF)
 705
 706 =back
 707
 708 For encodings that are implemented by the C<Encode::XS> module, C<CHECK> C<==>
 709 C<Encode::FB_PERLQQ> puts C<encode> and C<decode> into C<perlqq> fallback mode.
 710
 711 When you decode, C<\xI<HH>> is inserted for a malformed character, where
 712 I<HH> is the hex representation of the octet that could not be decoded to
 713 utf8.  When you encode, C<\x{I<HHHH>}> will be inserted, where I<HHHH> is
 714 the Unicode code point (in any number of hex digits) of the character that
 715 cannot be found in the character repertoire of the encoding.
 716
 717 The HTML/XML character reference modes are about the same. In place of
 718 C<\x{I<HHHH>}>, HTML uses C<&#I<NNN>;> where I<NNN> is a decimal number, and
 719 XML uses C<&#xI<HHHH>;> where I<HHHH> is the hexadecimal number.
 720
 721 In C<Encode> 2.10 or later, C<LEAVE_SRC> is also implied.
 722
 723 =head3 The bitmask
 724
 725 These modes are all actually set via a bitmask.  Here is how the C<FB_I<XXX>>
 726 constants are laid out.  You can import the C<FB_I<XXX>> constants via
 727 C<use Encode qw(:fallbacks)>, and you can import the generic bitmask
 728 constants via C<use Encode qw(:fallback_all)>.
 729
 730                      FB_DEFAULT FB_CROAK FB_QUIET FB_WARN  FB_PERLQQ
 731  DIE_ON_ERR    0x0001             X
 732  WARN_ON_ERR   0x0002                               X
 733  RETURN_ON_ERR 0x0004                      X        X
 734  LEAVE_SRC     0x0008                                        X
 735  PERLQQ        0x0100                                        X
 736  HTMLCREF      0x0200
 737  XMLCREF       0x0400
 738
 739 =head3 LEAVE_SRC
 740
 741   Encode::LEAVE_SRC
 742
 743 If the C<Encode::LEAVE_SRC> bit is I<not> set but I<CHECK> is set, then the
 744 source string to encode() or decode() will be overwritten in place.
 745 If you do not want that, bitwise-OR C<Encode::LEAVE_SRC> into I<CHECK>.
 746
 747 =head2 coderef for CHECK
 748
 749 As of C<Encode> 2.12, C<CHECK> can also be a code reference which takes the
 750 ordinal value of the unmapped character as an argument and returns
 751 octets that represent the fallback character.  For instance:
 752
 753   $ascii = encode("ascii", $utf8, sub{ sprintf "<U+%04X>", shift });
 754
 755 Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
 756
 757 Fallback for C<decode> must return decoded string (sequence of characters)
 758 and takes a list of ordinal values as its arguments. So for
 759 example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
 760 a fallback for bytes that are not valid UTF-8, you could write
 761
 762     $str = decode 'UTF-8', $octets, sub {
 763         my $tmp = join '', map chr, @_;
 764         return decode 'ISO-8859-15', $tmp;
 765     };
 766
 767 =head1 Defining Encodings
 768
 769 To define a new encoding, use:
 770
 771     use Encode qw(define_encoding);
 772     define_encoding($object, CANONICAL_NAME [, alias...]);
 773
 774 I<CANONICAL_NAME> will be associated with I<$object>.  The object
 775 should provide the interface described in L<Encode::Encoding>.
 776 If more than two arguments are provided, additional
 777 arguments are considered aliases for I<$object>.
 778
 779 See L<Encode::Encoding> for details.
 780
 781 =head1 The UTF8 flag
 782
 783 Before the introduction of Unicode support in Perl, the C<eq> operator
 784 just compared the strings represented by two scalars. Beginning with
 785 Perl 5.8, C<eq> compares two strings with simultaneous consideration of
 786 I<the UTF8 flag>. To explain why we made it so, I quote from page 402 of
 787 I<Programming Perl, 3rd ed.>
 788
 789 =over 2
 790
 791 =item Goal #1:
 792
 793 Old byte-oriented programs should not spontaneously break on the old
 794 byte-oriented data they used to work on.
 795
 796 =item Goal #2:
 797
 798 Old byte-oriented programs should magically start working on the new
 799 character-oriented data when appropriate.
 800
 801 =item Goal #3:
 802
 803 Programs should run just as fast in the new character-oriented mode
 804 as in the old byte-oriented mode.
 805
 806 =item Goal #4:
 807
 808 Perl should remain one language, rather than forking into a
 809 byte-oriented Perl and a character-oriented Perl.
 810
 811 =back
 812
 813 When I<Programming Perl, 3rd ed.> was written, not even Perl 5.6.0 had been
 814 born yet, and many features documented in the book remained unimplemented for a
 815 long time.  Perl 5.8 corrected much of this, and the introduction of the
 816 UTF8 flag is one of them.  You can think of there being two fundamentally
 817 different kinds of strings and string-operations in Perl: one a
 818 byte-oriented mode  for when the internal UTF8 flag is off, and the other a
 819 character-oriented mode for when the internal UTF8 flag is on.
 820
 821 This UTF8 flag is not visible in Perl scripts, exactly for the same reason
 822 you cannot (or rather, you I<don't have to>) see whether a scalar contains
 823 a string, an integer, or a floating-point number.   But you can still peek
 824 and poke these if you will.  See the next section.
 825
 826 =head2 Messing with Perl's Internals
 827
 828 The following API uses parts of Perl's internals in the current
 829 implementation.  As such, they are efficient but may change in a future
 830 release.
 831
 832 =head3 is_utf8
 833
 834   is_utf8(STRING [, CHECK])
 835
 836 [INTERNAL] Tests whether the UTF8 flag is turned on in the I<STRING>.
 837 If I<CHECK> is true, also checks whether I<STRING> contains well-formed
 838 UTF-8.  Returns true if successful, false otherwise.
 839
 840 Typically only necessary for debugging and testing.  Don't use this flag as
 841 a marker to distinguish character and binary data; that should be decided
 842 for each variable when you write your code.
 843
 844 B<CAVEAT>: If I<STRING> has UTF8 flag set, it does B<NOT> mean that
 845 I<STRING> is UTF-8 encoded and vice-versa.
 846
 847 As of Perl 5.8.1, L<utf8> also has the C<utf8::is_utf8> function.
 848
 849 =head3 _utf8_on
 850
 851   _utf8_on(STRING)
 852
 853 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<on>.  The I<STRING>
 854 is I<not> checked for containing only well-formed UTF-8.  Do not use this
 855 unless you I<know with absolute certainty> that the STRING holds only
 856 well-formed UTF-8.  Returns the previous state of the UTF8 flag (so please
 857 don't treat the return value as indicating success or failure), or C<undef>
 858 if I<STRING> is not a string.
 859
 860 B<NOTE>: For security reasons, this function does not work on tainted values.
 861
 862 =head3 _utf8_off
 863
 864   _utf8_off(STRING)
 865
 866 [INTERNAL] Turns the I<STRING>'s internal UTF8 flag B<off>.  Do not use
 867 frivolously.  Returns the previous state of the UTF8 flag, or C<undef> if
 868 I<STRING> is not a string.  Do not treat the return value as indicative of
 869 success or failure, because that isn't what it means: it is only the
 870 previous setting.
 871
 872 B<NOTE>: For security reasons, this function does not work on tainted values.
 873
 874 =head1 UTF-8 vs. utf8 vs. UTF8
 875
 876   ....We now view strings not as sequences of bytes, but as sequences
 877   of numbers in the range 0 .. 2**32-1 (or in the case of 64-bit
 878   computers, 0 .. 2**64-1) -- Programming Perl, 3rd ed.
 879
 880 That has historically been Perl's notion of UTF-8, as that is how UTF-8 was
 881 first conceived by Ken Thompson when he invented it. However, thanks to
 882 later revisions to the applicable standards, official UTF-8 is now rather
 883 stricter than that. For example, its range is much narrower (0 .. 0x10_FFFF
 884 to cover only 21 bits instead of 32 or 64 bits) and some sequences
 885 are not allowed, like those used in surrogate pairs, the 32 non-character
 886 code points 0xFDD0 .. 0xFDEF, the last two code points in I<any> plane
 887 (0xI<XX>_FFFE and 0xI<XX>_FFFF), all non-shortest encodings, etc.
 888
 889 The former default in which Perl would always use a loose interpretation of
 890 UTF-8 has now been overruled:
 891
 892   From: Larry Wall <larry@wall.org>
 893   Date: December 04, 2004 11:51:58 JST
 894   To: perl-unicode@perl.org
 895   Subject: Re: Make Encode.pm support the real UTF-8
 896   Message-Id: <20041204025158.GA28754@wall.org>
 897
 898   On Fri, Dec 03, 2004 at 10:12:12PM +0000, Tim Bunce wrote:
 899   : I've no problem with 'utf8' being perl's unrestricted uft8 encoding,
 900   : but "UTF-8" is the name of the standard and should give the
 901   : corresponding behaviour.
 902
 903   For what it's worth, that's how I've always kept them straight in my
 904   head.
 905
 906   Also for what it's worth, Perl 6 will mostly default to strict but
 907   make it easy to switch back to lax.
 908
 909   Larry
 910
 911 Got that?  As of Perl 5.8.7, B<"UTF-8"> means UTF-8 in its current
 912 sense, which is conservative and strict and security-conscious, whereas
 913 B<"utf8"> means UTF-8 in its former sense, which was liberal and loose and
 914 lax.  C<Encode> version 2.10 or later thus groks this subtle but critically
 915 important distinction between C<"UTF-8"> and C<"utf8">.
 916
 917   encode("utf8",  "\x{FFFF_FFFF}", 1); # okay
 918   encode("UTF-8", "\x{FFFF_FFFF}", 1); # croaks
 919
 920 This distinction is also important for decoding. In the following,
 921 C<$s> stores character U+200000, which exceeds UTF-8's allowed range.
 922 C<$s> thus stores an invalid Unicode code point:
 923
 924   $s = decode("utf8", "\xf8\x88\x80\x80\x80");
 925
 926 C<"UTF-8">, by contrast, will either coerce the input to something valid:
 927
 928     $s = decode("UTF-8", "\xf8\x88\x80\x80\x80"); # U+FFFD
 929
 930 .. or croak:
 931
 932     decode("UTF-8", "\xf8\x88\x80\x80\x80", FB_CROAK|LEAVE_SRC);
 933
 934 In the C<Encode> module, C<"UTF-8"> is actually an alias for the canonical
 935 name C<"utf-8-strict">.  That hyphen between the C<"UTF"> and the C<"8"> is
 936 critical; without it, C<Encode> goes "liberal" and (perhaps overly-)permissive:
 937
 938   find_encoding("UTF-8")->name # is 'utf-8-strict'
 939   find_encoding("utf-8")->name # ditto. names are case insensitive
 940   find_encoding("utf_8")->name # ditto. "_" are treated as "-"
 941   find_encoding("UTF8")->name  # is 'utf8'.
 942
 943 Perl's internal UTF8 flag is called "UTF8", without a hyphen. It indicates
 944 whether a string is internally encoded as "utf8", also without a hyphen.
 945
 946 =head1 SEE ALSO
 947
 948 L<Encode::Encoding>,
 949 L<Encode::Supported>,
 950 L<Encode::PerlIO>,
 951 L<encoding>,
 952 L<perlebcdic>,
 953 L<perlfunc/open>,
 954 L<perlunicode>, L<perluniintro>, L<perlunifaq>, L<perlunitut>,
 955 L<utf8>,
 956 the Perl Unicode Mailing List L<http://lists.perl.org/list/perl-unicode.html>
 957
 958 =head1 MAINTAINER
 959
 960 This project was originated by the late Nick Ing-Simmons and later
 961 maintained by Dan Kogai I<< <dankogai@cpan.org> >>.  See AUTHORS
 962 for a full list of people involved.  For any questions, send mail to
 963 I<< <perl-unicode@perl.org> >> so that we can all share.
 964
 965 While Dan Kogai retains the copyright as a maintainer, credit
 966 should go to all those involved.  See AUTHORS for a list of those
 967 who submitted code to the project.
 968
 969 =head1 COPYRIGHT
 970
 971 Copyright 2002-2014 Dan Kogai I<< <dankogai@cpan.org> >>.
 972
 973 This library is free software; you can redistribute it and/or modify
 974 it under the same terms as Perl itself.
 975
 976 =cut