From c731e18ea9a2e5789568f97517c84f11d4759958 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Mon, 8 Apr 2002 12:35:08 +0000 Subject: [PATCH] Upgrade to Encode 1.30, from Dan Kogai. p4raw-id: //depot/perl@15803 --- MANIFEST | 7 +- ext/Encode/Changes | 31 +++++- ext/Encode/Encode.pm | 141 +++++++++++++------------- ext/Encode/MANIFEST | 3 +- ext/Encode/lib/Encode/Alias.pm | 4 +- ext/Encode/lib/Encode/Encoder.pm | 195 ++++++++++++++++++++++++++++++++++++ ext/Encode/lib/Encode/Supported.pod | 28 ++++-- ext/Encode/lib/Encode/Unicode.pm | 7 +- ext/Encode/t/Unicode.t | 55 +++++----- ext/Encode/t/grow.t | 39 -------- 10 files changed, 358 insertions(+), 152 deletions(-) create mode 100644 ext/Encode/lib/Encode/Encoder.pm diff --git a/MANIFEST b/MANIFEST index fe6b030..a907be9 100644 --- a/MANIFEST +++ b/MANIFEST @@ -204,7 +204,7 @@ ext/Encode/CN/Makefile.PL Encode extension ext/Encode/Changes Change Log ext/Encode/EBCDIC/EBCDIC.pm Encode extension ext/Encode/EBCDIC/Makefile.PL Encode extension -ext/Encode/Encode.pm Encode extension +ext/Encode/Encode.pm Mother of all Encode extensions ext/Encode/Encode.xs Encode extension ext/Encode/Encode/Changes.e2x Skeleton file for enc2xs ext/Encode/Encode/Makefile_PL.e2x Skeleton file for enc2xs @@ -232,6 +232,7 @@ ext/Encode/encoding.pm Perl Pragmactic Module ext/Encode/lib/Encode/Alias.pm Encode extension ext/Encode/lib/Encode/CJKConstants.pm Encode extension ext/Encode/lib/Encode/CN/HZ.pm Encode extension +ext/Encode/lib/Encode/Encoder.pm OO Encoder ext/Encode/lib/Encode/Encoding.pm Encode extension ext/Encode/lib/Encode/JP/2022_JP.pm Encode extension ext/Encode/lib/Encode/JP/2022_JP1.pm Encode extension @@ -247,11 +248,11 @@ ext/Encode/t/Encode.t Encode extension test ext/Encode/t/JP.t Encode extension test ext/Encode/t/KR.t Encode extension test ext/Encode/t/TW.t Encode extension test -ext/Encode/t/Unicode.t Encode extension test +ext/Encode/t/Unicode.t Encode extension test ext/Encode/t/encoding.t encoding extension test ext/Encode/t/gb2312.euc test data ext/Encode/t/gb2312.ref test data -ext/Encode/t/grow.t Encode extension test +ext/Encode/t/grow.t Encode extension test ext/Encode/t/jisx0208.euc test data ext/Encode/t/jisx0208.ref test data ext/Encode/t/jisx0212.euc test data diff --git a/ext/Encode/Changes b/ext/Encode/Changes index ee5d260..145d8af 100644 --- a/ext/Encode/Changes +++ b/ext/Encode/Changes @@ -1,9 +1,34 @@ # Revision history for Perl extension Encode. # -# $Id: Changes,v 1.26 2002/04/07 15:22:04 dankogai Exp $ +# $Id: Changes,v 1.30 2002/04/08 02:34:51 dankogai Exp $ # -1.26 $Date: 2002/04/07 15:22:04 $ +1.30 $Date: 2002/04/08 02:34:51 $ ++ lib/Encode/Encoder.pm + Object Oriented Encoder. I reckon something like this is in need. +! Encode.pm +! t/Unicode.pm +! lib/Encode/Supported.pod + * autoloading but that prevented upper-case canonicals such as UTF-16 + is fixed. Now even UTF/UCS are autoloaded! + * encodings() is now more intuitive. + * t/Unicode.t fixed to explicitly use Unicode.pm -- BOM values are + stored therein. + * Obligatory fixes to the POD. +! lib/Encode/Supported.pod + Patch from Anton applied. + Message-Id: <66641479.20020408033300@motor.ru> +! Encode.pm +! lib/Encode/Unicode.pm + Cosmetic changes: "bless $obj, $class" => "bless $obj => class" + +1.28 2002/04/07 18:58:42 +! MANIFEST ++ t/Unicode.t ++ t/grow.t + Just a MANIFEST for those missing files. + +1.26 Date: 2002/04/07 15:22:04 ! JP/Makefile.PL ! t/Aliases.PL Schwarn's patches against Makefile.PL has zapped jis*.ucm. Restored. @@ -147,7 +172,7 @@ Typo fixes and improvements by jhi Message-Id: <200204010201.FAA03564@alpha.hut.fi>, et al. -1.11 $Date: 2002/04/07 15:22:04 $ +1.11 $Date: 2002/04/08 02:34:51 $ + t/encoding.t + t/jperl.t ! MANIFEST diff --git a/ext/Encode/Encode.pm b/ext/Encode/Encode.pm index b28acc5..caada11 100644 --- a/ext/Encode/Encode.pm +++ b/ext/Encode/Encode.pm @@ -1,6 +1,6 @@ package Encode; use strict; -our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.30 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require DynaLoader; @@ -60,68 +60,73 @@ my @macintosh = qw( ); for my $k (2..11,13..16){ - $ExtModule{"iso-8859-$k"} = 'Encode/Byte.pm'; + $ExtModule{"iso-8859-$k"} = 'Encode::Byte'; } for my $k (@codepages){ - $ExtModule{"cp$k"} = 'Encode/Byte.pm'; + $ExtModule{"cp$k"} = 'Encode::Byte'; } for my $k (@macintosh) { - $ExtModule{"mac$k"} = 'Encode/Byte.pm'; + $ExtModule{"mac$k"} = 'Encode::Byte'; +} + +for my $k (qw(UCS-2BE UCS-2LE UTF-16 UTF-16BE UTF-16LE + UTF-32 UTF-32BE UTF-32LE)){ + $ExtModule{$k} = 'Encode::Unicode'; } %ExtModule = (%ExtModule, - 'koi8-r' => 'Encode/Byte.pm', - 'posix-bc' => 'Encode/EBCDIC.pm', - cp037 => 'Encode/EBCDIC.pm', - cp1026 => 'Encode/EBCDIC.pm', - cp1047 => 'Encode/EBCDIC.pm', - cp500 => 'Encode/EBCDIC.pm', - cp875 => 'Encode/EBCDIC.pm', - dingbats => 'Encode/Symbol.pm', - macDingbats => 'Encode/Symbol.pm', - macSymbol => 'Encode/Symbol.pm', - symbol => 'Encode/Symbol.pm', - viscii => 'Encode/Byte.pm', + 'koi8-r' => 'Encode::Byte', + 'posix-bc' => 'Encode::EBCDIC', + cp37 => 'Encode::EBCDIC', + cp1026 => 'Encode::EBCDIC', + cp1047 => 'Encode::EBCDIC', + cp500 => 'Encode::EBCDIC', + cp875 => 'Encode::EBCDIC', + dingbats => 'Encode::Symbol', + macDingbats => 'Encode::Symbol', + macSymbol => 'Encode::Symbol', + symbol => 'Encode::Symbol', + viscii => 'Encode::Byte', ); unless ($ON_EBCDIC) { # CJK added to autoload unless EBCDIC env %ExtModule = (%ExtModule, - 'cp936' => 'Encode/CN.pm', - 'euc-cn' => 'Encode/CN.pm', - 'gb12345-raw' => 'Encode/CN.pm', - 'gb2312-raw' => 'Encode/CN.pm', - 'gbk' => 'Encode/CN.pm', - 'iso-ir-165' => 'Encode/CN.pm', - - '7bit-jis' => 'Encode/JP.pm', - 'cp932' => 'Encode/JP.pm', - 'euc-jp' => 'Encode/JP.pm', - 'iso-2022-jp' => 'Encode/JP.pm', - 'iso-2022-jp-1' => 'Encode/JP.pm', - 'jis0201-raw' => 'Encode/JP.pm', - 'jis0208-raw' => 'Encode/JP.pm', - 'jis0212-raw' => 'Encode/JP.pm', - 'macJapanese' => 'Encode/JP.pm', - 'shiftjis' => 'Encode/JP.pm', - - 'cp949' => 'Encode/KR.pm', - 'euc-kr' => 'Encode/KR.pm', - 'ksc5601' => 'Encode/KR.pm', - 'macKorean' => 'Encode/KR.pm', - - 'big5' => 'Encode/TW.pm', - 'big5-hkscs' => 'Encode/TW.pm', - 'cp950' => 'Encode/TW.pm', - - 'big5plus' => 'Encode/HanExtra.pm', - 'euc-tw' => 'Encode/HanExtra.pm', - 'gb18030' => 'Encode/HanExtra.pm', + 'cp936' => 'Encode::CN', + 'euc-cn' => 'Encode::CN', + 'gb12345-raw' => 'Encode::CN', + 'gb2312-raw' => 'Encode::CN', + 'gbk' => 'Encode::CN', + 'iso-ir-165' => 'Encode::CN', + + '7bit-jis' => 'Encode::JP', + 'cp932' => 'Encode::JP', + 'euc-jp' => 'Encode::JP', + 'iso-2022-jp' => 'Encode::JP', + 'iso-2022-jp-1' => 'Encode::JP', + 'jis0201-raw' => 'Encode::JP', + 'jis0208-raw' => 'Encode::JP', + 'jis0212-raw' => 'Encode::JP', + 'macJapanese' => 'Encode::JP', + 'shiftjis' => 'Encode::JP', + + 'cp949' => 'Encode::KR', + 'euc-kr' => 'Encode::KR', + 'ksc5601' => 'Encode::KR', + 'macKorean' => 'Encode::KR', + + 'big5' => 'Encode::TW', + 'big5-hkscs' => 'Encode::TW', + 'cp950' => 'Encode::TW', + + 'big5plus' => 'Encode::HanExtra', + 'euc-tw' => 'Encode::HanExtra', + 'gb18030' => 'Encode::HanExtra', ); } @@ -129,16 +134,15 @@ sub encodings { my $class = shift; my @modules = (@_ and $_[0] eq ":all") ? values %ExtModule : @_; - for my $m (@modules) - { - $DEBUG and warn "about to require $m;"; - eval { require $m; }; + for my $mod (@modules){ + $mod =~ s,::,/,g or $mod = "Encode/$mod"; + $mod .= '.pm'; + $DEBUG and warn "about to require $mod;"; + eval { require $mod; }; } + my %modules = map {$_ => 1} @modules; return - map({$_->[0]} - sort({$a->[1] cmp $b->[1]} - map({[$_, lc $_]} - grep({ $_ ne 'Internal' } keys %Encoding)))); + sort grep {!/^(?:Internal|Unicode)$/o} keys %Encoding; } sub define_encoding @@ -180,12 +184,14 @@ sub getEncoding $oc = $class->find_alias($lc) if $lc ne $name; return $oc if defined $oc; - if (!$skip_external and exists $ExtModule{$lc}) + unless ($skip_external) { - eval{ require $ExtModule{$lc}; }; - return $Encoding{$name} if exists $Encoding{$name}; + if (my $mod = $ExtModule{$name} || $ExtModule{$lc}){ + $mod =~ s,::,/,g ; $mod .= '.pm'; + eval{ require $mod; }; + return $Encoding{$name} if exists $Encoding{$name}; + } } - return; } @@ -232,7 +238,7 @@ sub from_to sub encode_utf8 { my ($str) = @_; - utf8::encode($str); + utf8::encode($str); return $str; } @@ -274,8 +280,8 @@ sub predefine_encodings{ $_[1] = '' if $chk; return $res; }; - $Encode::Encoding{Unicode} = - bless {Name => "UTF_EBCDIC"}, "Encode::UTF_EBCDIC"; + $Encode::Encoding{Internal} = + bless {Name => "UTF_EBCDIC"} => "Encode::UTF_EBCDIC"; } else { # was in Encode::UTF_EBCDIC package Encode::Internal; @@ -289,7 +295,7 @@ sub predefine_encodings{ }; *encode = \&decode; $Encode::Encoding{Unicode} = - bless {Name => "Internal"}, "Encode::Internal"; + bless {Name => "Internal"} => "Encode::Internal"; } { @@ -313,12 +319,12 @@ sub predefine_encodings{ return $octets; }; $Encode::Encoding{utf8} = - bless {Name => "utf8"}, "Encode::utf8"; + bless {Name => "utf8"} => "Encode::utf8"; } # do externals if necessary require File::Basename; require File::Spec; - for my $ext (qw(Unicode)){ + for my $ext (qw()){ my $pm = File::Spec->catfile(File::Basename::dirname($INC{'Encode.pm'}), "Encode", "$ext.pm"); @@ -501,10 +507,11 @@ ones that are not loaded yet, say Or you can give the name of specific module. - @with_jp = Encode->encodings("Encode/JP.pm"); + @with_jp = Encode->encodings("Encode::JP"); + +When "::" is not in the name, "Encode::" is assumed. -Note in this case you have to say C<"Encode/JP.pm"> instead of -C<"Encode::JP">. + @ebcdic = Encode->encodings("EBCDIC"); To find which encodings are supported by this package in details, see L. diff --git a/ext/Encode/MANIFEST b/ext/Encode/MANIFEST index d45f8e6..849baea 100644 --- a/ext/Encode/MANIFEST +++ b/ext/Encode/MANIFEST @@ -6,7 +6,7 @@ CN/Makefile.PL Encode extension Changes Change Log EBCDIC/EBCDIC.pm Encode extension EBCDIC/Makefile.PL Encode extension -Encode.pm Encode extension +Encode.pm Mother of all Encode extensions Encode.xs Encode extension Encode/Changes.e2x Skeleton file for enc2xs Encode/Makefile_PL.e2x Skeleton file for enc2xs @@ -34,6 +34,7 @@ encoding.pm Perl Pragmactic Module lib/Encode/Alias.pm Encode extension lib/Encode/CJKConstants.pm Encode extension lib/Encode/CN/HZ.pm Encode extension +lib/Encode/Encoder.pm OO Encoder lib/Encode/Encoding.pm Encode extension lib/Encode/JP/2022_JP.pm Encode extension lib/Encode/JP/2022_JP1.pm Encode extension diff --git a/ext/Encode/lib/Encode/Alias.pm b/ext/Encode/lib/Encode/Alias.pm index dc79b85..dd7012f 100644 --- a/ext/Encode/lib/Encode/Alias.pm +++ b/ext/Encode/lib/Encode/Alias.pm @@ -1,7 +1,7 @@ package Encode::Alias; use strict; use Encode; -our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; our $DEBUG = 0; require Exporter; @@ -178,7 +178,7 @@ sub init_aliases define_alias( qr/\b(?:ibm|ms|windows)[-_]?(\d\d\d\d?)$/i => '"cp$1"'); # Sometimes seen with a leading zero. - define_alias( qr/\bcp037\b/i => '"cp37"'); + # define_alias( qr/\bcp037\b/i => '"cp37"'); # Mac Mappings # predefined in *.ucm; unneeded diff --git a/ext/Encode/lib/Encode/Encoder.pm b/ext/Encode/lib/Encode/Encoder.pm new file mode 100644 index 0000000..1fc65ea --- /dev/null +++ b/ext/Encode/lib/Encode/Encoder.pm @@ -0,0 +1,195 @@ +# +# $Id: Encoder.pm,v 0.1 2002/04/08 02:35:10 dankogai Exp $ +# +package Encoder; +use strict; +our $VERSION = do { my @r = (q$Revision: 0.1 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; + +require Exporter; +our @ISA = qw(Exporter); +# Public, encouraged API is exported by default +our @EXPORT = qw ( + encoder +); + +our $AUTOLOAD; +our $DEBUG = 0; +use Encode qw(encode decode find_encoding from_to); +use Carp; + +sub new{ + my ($class, $data, $encname) = @_; + $encname ||= 'utf8'; + my $obj = find_encoding($encname) + or croak __PACKAGE__, ": unknown encoding: $encname"; + my $self = { + data => $data, + encoding => $obj->name, + }; + bless $self => $class; +} + +sub encoder{ shift->new(@_) } + +sub data{ + my ($self, $data) = shift; + defined $data and $self->{data} = $data; + return $self; +} + +sub encoding{ + my ($self, $encname) = @_; + if ($encname){ + my $obj = find_encoding($encname) + or confess __PACKAGE__, ": unknown encoding: $encname"; + $self->{encoding} = $obj->name; + } + $self; +} + +sub AUTOLOAD { + my $self = shift; + my $type = ref($self) + or confess "$self is not an object"; + my $myname = $AUTOLOAD; + $myname =~ s/.*://; # strip fully-qualified portion + my $obj = find_encoding($myname) + or confess __PACKAGE__, ": unknown encoding: $myname"; + $DEBUG and warn $self->{encoding}, " => ", $obj->name; + from_to($self->{data}, $self->{encoding}, $obj->name, 1); + $self->{encoding} = $obj->name; + return $self; +} + +use overload + q("") => sub { $_[0]->{data} }, + q(0+) => sub { use bytes (); bytes::length($_[0]->{data}) }, + fallback => 1, + ; + +1; +__END__ + +=head1 NAME + +Encode::Encoder -- Object Oriented Encoder + +=head1 SYNOPSIS + + use Encode::Encoder; + # Encode::encode("ISO-8859-1", $data); + Encoder->new($data)->iso_8859_1; # OOP way + # shortcut + encoder($data)->iso_8859_1; + # you can stack them! + encoder($data)->iso_8859_1->base64; # provided base64() is defined + # stringified + print encoder($utf8)->latin1 # prints the string in latin1 + # numified + encoder("\x{abcd}\x{ef}g") == 6; # true. bytes::length($data) + +=head1 ABSTRACT + +B allows you to use Encode via OOP style. This is +not only more intuitive than functional approach, but also handier +when you want to stack encodings. Suppose you want your UTF-8 string +converted to Latin1 then Base64, you can simply say + + my $base64 = encoder($utf8)->latin1->base64; + +instead of + + my $latin1 = encode("latin1", $utf8); + +or lazier and convolted + + my $base64 = encode_base64(encode("latin1", $utf8)); + +=head1 Description + +Here is how to use this module. + +=over 4 + +=item * + +There are at least two instance variable stored in hash reference, +{data} and {encoding}. + +=item * + +When there is no method, it takes the method name as the name of +encoding and encode instance I with I. If successful, +instance I is set accordingly. + +=item * + +This module is desined to work with L. +To make the Base64 transcorder example above really work, you should +write a module like this. + + package Encode::Base64; + use base 'Encode::Encoding'; + __PACKAGE->Define('base64'); + use MIME::Base64; + sub encode{ + my ($obj, $data) = @_; + return encode_base64($data); + } + sub decode{ + my ($obj, $data) = @_; + return decode_base64($data); + } + 1; + __END__ + +And your caller module should be like this; + + use Encode::Encoder; + use Encode::Base64; + # and be creative. + +=head2 operator overloading + +This module overloads two operators, stringify ("") and numify (0+). + +Stringify dumps the data therein. + +Numify returns the number of bytes therein. + +They come in handy when you want to print or find the size of data. + +=back + +=head2 Predefined Methods + +This module predefines the methods below; + +=over 4 + +=item $e = Encode::Encoder-Enew([$data, $encoding]); + +returns the encoder object. Its data is initialized with $data if +there, and its encoding is set to $encoding if there. + +=item encoder() + +is an alias of Encode::Encoder-Enew(). This one is exported for +convenience. + +=item $e-Edata($data) + +sets instance data to $data. + +=item $e-Eencoding($encoding) + +sets instance encoding to $encoding + +=back + +=head1 SEE ALSO + +L +L + +=cut diff --git a/ext/Encode/lib/Encode/Supported.pod b/ext/Encode/lib/Encode/Supported.pod index a0beca3..0517f1a 100644 --- a/ext/Encode/lib/Encode/Supported.pod +++ b/ext/Encode/lib/Encode/Supported.pod @@ -63,6 +63,15 @@ The following encodings are always available. ascii US-ascii [ECMA] iso-8859-1 latin1 [ISO] utf8 UTF-8 [RFC2279] + ---------------------------------------------------------------- + + +=head2 Encode::Unicode -- other Unicode encodings + +Unicode coding schemes other than native utf8 are supported by +Encode::Unicode which will be autoloaded on demand. + + ---------------------------------------------------------------- UCS-2BE UCS-2, iso-10646-1 [IANA, UC] UCS-2LE [UC] UTF-16 [UC] @@ -454,7 +463,7 @@ Encoding names are registered to IANA as preferred MIME names and may probably be used over the Internet. -C has been officialized by JIS X 0208-1997. +C has been officialized by JIS X 0208:1997. L gives details. C is the IANA name for C. @@ -494,17 +503,17 @@ then C support =item * -data coded with C seamlessly passes traditional -command piping (C, C, etc.) while UTF-16 coded +C coded data seamlessly passes traditional +command piping (C, C, etc.) while C coded data is likely to cause confusion (with it's zero bytes, for example) =item * it is beyond the power of words to describe the way HTML browsers -encode non-C form data. To get a general impression refer to +encode non-C form data. To get a general impression visit L. -While encoding of form data has stabilzed for C coded pages +While encoding of form data has stabilized for C coded pages (at least IE 5/6, NS 6, Opera 6 behave consitently), be sure to expect fun (and cross-browser discrepancies) with C coded pages! @@ -512,7 +521,7 @@ pages! =back The rule of thumb is to use C unless you know what -you're doing and unless you really need from using C. +you're doing and unless you really benefit from using C. ISO-IR-165 [RFC1345] @@ -541,7 +550,7 @@ Microsoft products misuse the following names: Microsoft extension to C. -Proper name: C. +Proper names: C, C, C (as used by Mozilla). See L for details. @@ -583,14 +592,15 @@ Microsoft's understanding of C. JIS has not endorsed the full Microsoft standard however. The official C includes only JIS X 0201 and JIS X 0208 subsets, while Microsoft has always been meaning C to -encode a wider character repertoire. +encode a wider character repertoire, see C registration for +C. As a historical predecessor Microsoft's variant probably has more rights for the name, albeit it may be objected that Microsoft shouldn't have used JIS as part of the name in the first place. -Unabiguous name: C. +Unabiguous name: C. C name (not used?): C. Encode separately supports C and C. diff --git a/ext/Encode/lib/Encode/Unicode.pm b/ext/Encode/lib/Encode/Unicode.pm index 1bbd9db..c90e8b6 100644 --- a/ext/Encode/lib/Encode/Unicode.pm +++ b/ext/Encode/lib/Encode/Unicode.pm @@ -3,7 +3,7 @@ package Encode::Unicode; use strict; use warnings; -our $VERSION = do { my @r = (q$Revision: 1.25 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; +our $VERSION = do { my @r = (q$Revision: 1.26 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r }; # # Aux. subs & constants @@ -66,7 +66,7 @@ for my $name (qw(UTF-16 UTF-16BE UTF-16LE size => $size, endian => $endian, ucs2 => $ucs2, - }, __PACKAGE__; + } => __PACKAGE__; } @@ -387,7 +387,8 @@ Galaxy> Triology, C. Their mistake was not this magnitude so let's forgive them. (I don't dare make any comparison with Unicode Consortium and the -Vogols here :) +Vogons here ;) Or, comparing Encode to Babel Fish is completely +appropriate -- if you can only stick this into your ear :) A surrogate pair was born when Unicode Consortium had finally admitted that 16 bit was not big enough to hold all the world's diff --git a/ext/Encode/t/Unicode.t b/ext/Encode/t/Unicode.t index 75486ad..ea52f15 100644 --- a/ext/Encode/t/Unicode.t +++ b/ext/Encode/t/Unicode.t @@ -1,33 +1,34 @@ # -# $Id: Unicode.t,v 1.2 2002/04/07 17:22:31 dankogai Exp dankogai $ +# $Id: Unicode.t,v 1.4 2002/04/08 02:35:48 dankogai Exp dankogai $ # # This script is written entirely in ASCII, even though quoted literals # do include non-BMP unicode characters -- Are you happy, jhi? # BEGIN { - require Config; import Config; - if ($Config{'extensions'} !~ /\bEncode\b/) { - print "1..0 # Skip: Encode was not built\n"; - exit 0; - } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } # should work without perlio # unless (find PerlIO::Layer 'perlio') { -# print "1..0 # Skip: PerlIO was not built\n"; -# exit 0; +# print "1..0 # Skip: PerlIO was not built\n"; +# exit 0; # } # should work on EBCDIC # if (ord("A") == 193) { -# print "1..0 # Skip: EBCDIC\n"; -# exit 0; +# print "1..0 # Skip: EBCDIC\n"; +# exit 0; # } - $| = 1; + $| = 1; } use strict; #use Test::More 'no_plan'; use Test::More tests => 22; use Encode qw(encode decode); +use Encode::Unicode; # to load BOM defs # # see @@ -40,18 +41,18 @@ my $fallback = "\x{004D}\x{0061}\x{fffd}"; #hi: (0x1abcd - 0x10000) / 0x400 + 0xD800 = 0xd82a #lo: (0x1abcd - 0x10000) % 0x400 + 0xDC00 = 0xdfcd -my $n_16be = - pack("C*", map {hex($_)} qw<00 4D 00 61 d8 2a df cd>); +my $n_16be = + pack("C*", map {hex($_)} qw<00 4D 00 61 d8 2a df cd>); my $n_16le = - pack("C*", map {hex($_)} qw<4D 00 61 00 2a d8 cd df>); -my $f_16be = - pack("C*", map {hex($_)} qw<00 4D 00 61 ff fd>); + pack("C*", map {hex($_)} qw<4D 00 61 00 2a d8 cd df>); +my $f_16be = + pack("C*", map {hex($_)} qw<00 4D 00 61 ff fd>); my $f_16le = - pack("C*", map {hex($_)} qw<4D 00 61 00 fd ff>); -my $n_32be = - pack("C*", map {hex($_)} qw<00 00 00 4D 00 00 00 61 00 01 ab cd>); -my $n_32le = - pack("C*", map {hex($_)} qw<4D 00 00 00 61 00 00 00 cd ab 01 00>); + pack("C*", map {hex($_)} qw<4D 00 61 00 fd ff>); +my $n_32be = + pack("C*", map {hex($_)} qw<00 00 00 4D 00 00 00 61 00 01 ab cd>); +my $n_32le = + pack("C*", map {hex($_)} qw<4D 00 00 00 61 00 00 00 cd ab 01 00>); my $n_16bb = pack('n', Encode::Unicode::BOM_BE) . $n_16be; my $n_16lb = pack('n', Encode::Unicode::BOM16LE) . $n_16le; @@ -77,16 +78,20 @@ is($nasty, decode('UTF-32', $n_32lb), qq{decode UTF-32, bom=le}); is(decode('UCS-2BE', $n_16be), $fallback, "decode UCS-2BE: fallback"); is(decode('UCS-2LE', $n_16le), $fallback, "decode UCS-2LE: fallback"); -eval { decode('UCS-2BE', $n_16be, 1) }; +eval { decode('UCS-2BE', $n_16be, 1) }; ok($@=~/^UCS-2BE:/, "decode UCS-2BE: exception"); -eval { decode('UCS-2LE', $n_16le, 1) }; +eval { decode('UCS-2LE', $n_16le, 1) }; ok($@=~/^UCS-2LE:/, "decode UCS-2LE: exception"); is(encode('UCS-2BE', $nasty), $f_16be, "encode UCS-2BE: fallback"); is(encode('UCS-2LE', $nasty), $f_16le, "encode UCS-2LE: fallback"); -eval { encode('UCS-2BE', $nasty, 1) }; +eval { encode('UCS-2BE', $nasty, 1) }; ok($@=~/^UCS-2BE:/, "encode UCS-2BE: exception"); -eval { encode('UCS-2LE', $nasty, 1) }; +eval { encode('UCS-2LE', $nasty, 1) }; ok($@=~/^UCS-2LE:/, "encode UCS-2LE: exception"); 1; __END__ + +use Devel::Peek; +my $foo = decode('UTF-16BE', $n_16be); +Dump $n_16be; Dump $foo; diff --git a/ext/Encode/t/grow.t b/ext/Encode/t/grow.t index 8cf2f03..e6b35fc 100644 --- a/ext/Encode/t/grow.t +++ b/ext/Encode/t/grow.t @@ -37,42 +37,3 @@ for my $i (1..$POWER){ __END__ -#!../perl -our $POWER; -BEGIN { - if ($ENV{'PERL_CORE'}){ - chdir 't'; - unshift @INC, '../lib'; - } - require Config; import Config; - if ($Config{'extensions'} !~ /\bEncode\b/) { - print "1..0 # Skip: Encode was not built\n"; - exit 0; - } - $POWER = 12; # up to 1 MB. You may adjust the figure here -} - -use strict; -use Encode; - -my $seed = ""; -for my $i (0x00..0xff){ - my $c = chr($i); - $seed .= ($c =~ /^\p{IsPrint}/o) ? $c : " "; -} - -use Test::More tests => $POWER*2; -my $octs = $seed; -use bytes (); -for my $i (1..$POWER){ - $octs .= $octs; - my $len = bytes::length($octs); - my $utf8 = Encode::decode('latin1', $octs); - ok(1, "decode $len bytes"); - is($octs, - Encode::encode('latin1', $utf8), - "encode $len bytes"); -} -__END__ - - -- 1.8.3.1