9 use File::Basename qw[dirname];
13 use IPC::Cmd qw[can_run run QUOTE];
14 use File::Path qw[mkpath];
15 use File::Temp qw[tempdir];
16 use Params::Check qw[check];
17 use Module::Load::Conditional qw[can_load];
18 use Locale::Maketext::Simple Style => 'gettext';
20 use vars qw[ $VERBOSE $PREFER_BIN $FROM_EMAIL $USER_AGENT
21 $BLACKLIST $METHOD_FAIL $VERSION $METHODS
22 $FTP_PASSIVE $TIMEOUT $DEBUG $WARN
26 $VERSION = eval $VERSION; # avoid warnings with development releases
27 $PREFER_BIN = 0; # XXX TODO implement
28 $FROM_EMAIL = 'File-Fetch@example.com';
29 $USER_AGENT = "File::Fetch/$VERSION";
30 $BLACKLIST = [qw|ftp|];
37 ### methods available to fetch the file depending on the scheme
39 http => [ qw|lwp httptiny wget curl lftp fetch httplite lynx iosock| ],
40 ftp => [ qw|lwp netftp wget curl lftp fetch ncftp ftp| ],
41 file => [ qw|lwp lftp file| ],
42 rsync => [ qw|rsync| ],
46 ### silly warnings ###
47 local $Params::Check::VERBOSE = 1;
48 local $Params::Check::VERBOSE = 1;
49 local $Module::Load::Conditional::VERBOSE = 0;
50 local $Module::Load::Conditional::VERBOSE = 0;
52 ### see what OS we are on, important for file:// uris ###
53 use constant ON_WIN => ($^O eq 'MSWin32');
54 use constant ON_VMS => ($^O eq 'VMS');
55 use constant ON_UNIX => (!ON_WIN);
56 use constant HAS_VOL => (ON_WIN);
57 use constant HAS_SHARE => (ON_WIN);
58 use constant HAS_FETCH => ( $^O =~ m!^(freebsd|netbsd|dragonfly)$! );
64 File::Fetch - A generic file fetching mechanism
70 ### build a File::Fetch object ###
71 my $ff = File::Fetch->new(uri => 'http://some.where.com/dir/a.txt');
73 ### fetch the uri to cwd() ###
74 my $where = $ff->fetch() or die $ff->error;
76 ### fetch the uri to /tmp ###
77 my $where = $ff->fetch( to => '/tmp' );
79 ### parsed bits from the uri ###
88 File::Fetch is a generic file fetching mechanism.
90 It allows you to fetch any file pointed to by a C<ftp>, C<http>,
91 C<file>, C<git> or C<rsync> uri by a number of different means.
93 See the C<HOW IT WORKS> section further down for details.
97 A C<File::Fetch> object has the following accessors
103 The uri you passed to the constructor
107 The scheme from the uri (like 'file', 'http', etc)
111 The hostname in the uri. Will be empty if host was originally
112 'localhost' for a 'file://' url.
116 On operating systems with the concept of a volume the second element
117 of a file:// is considered to the be volume specification for the file.
118 Thus on Win32 this routine returns the volume, on other operating
119 systems this returns nothing.
121 On Windows this value may be empty if the uri is to a network share, in
122 which case the 'share' property will be defined. Additionally, volume
123 specifications that use '|' as ':' will be converted on read to use ':'.
125 On VMS, which has a volume concept, this field will be empty because VMS
126 file specifications are converted to absolute UNIX format and the volume
127 information is transparently included.
131 On systems with the concept of a network share (currently only Windows) returns
132 the sharename from a file://// url. On other operating systems returns empty.
136 The path from the uri, will be at least a single '/'.
140 The name of the remote file. For the local file name, the
141 result of $ff->output_file will be used.
143 =item $ff->file_default
145 The name of the default local file, that $ff->output_file falls back to if
146 it would otherwise return no filename. For example when fetching a URI like
147 http://www.abc.net.au/ the contents retrieved may be from a remote file called
148 'index.html'. The default value of this attribute is literally 'file_default'.
153 ##########################
154 ### Object & Accessors ###
155 ##########################
158 ### template for autogenerated accessors ###
160 scheme => { default => 'http' },
161 host => { default => 'localhost' },
162 path => { default => '/' },
163 file => { required => 1 },
164 uri => { required => 1 },
165 vol => { default => '' }, # windows for file:// uris
166 share => { default => '' }, # windows for file:// uris
167 file_default => { default => 'file_default' },
168 tempdir_root => { required => 1 }, # Should be lazy-set at ->new()
169 _error_msg => { no_override => 1 },
170 _error_msg_long => { no_override => 1 },
173 for my $method ( keys %$Tmpl ) {
177 $self->{$method} = $_[0] if @_;
178 return $self->{$method};
186 my $args = check( $Tmpl, \%hash ) or return;
190 if( lc($args->scheme) ne 'file' and not $args->host ) {
191 return $class->_error(loc(
192 "Hostname required when fetching from '%1'",$args->scheme));
196 unless( $args->$_() ) { # 5.5.x needs the ()
197 return $class->_error(loc("No '%1' specified",$_));
205 =item $ff->output_file
207 The name of the output file. This is the same as $ff->file,
208 but any query parameters are stripped off. For example:
210 http://example.com/index.html?x=y
212 would make the output file be C<index.html> rather than
221 my $file = $self->file;
225 $file ||= $self->file_default;
230 ### XXX do this or just point to URI::Escape?
231 # =head2 $esc_uri = $ff->escaped_uri
235 # ### most of this is stolen straight from URI::escape
236 # { ### Build a char->hex map
237 # my %escapes = map { chr($_) => sprintf("%%%02X", $_) } 0..255;
241 # my $uri = $self->uri;
243 # ### Default unsafe characters. RFC 2732 ^(uric - reserved)
244 # $uri =~ s/([^A-Za-z0-9\-_.!~*'()])/
245 # $escapes{$1} || $self->_fail_hi($1)/ge;
255 # "Can't escape '%1', try using the '%2' module instead",
256 # sprintf("\\x{%04X}", ord($char)), 'URI::Escape'
269 =head2 $ff = File::Fetch->new( uri => 'http://some.where.com/dir/file.txt' );
271 Parses the uri and creates a corresponding File::Fetch::Item object,
272 that is ready to be C<fetch>ed and returns it.
274 Returns false on failure.
282 my ($uri, $file_default, $tempdir_root);
284 uri => { required => 1, store => \$uri },
285 file_default => { required => 0, store => \$file_default },
286 tempdir_root => { required => 0, store => \$tempdir_root },
289 check( $tmpl, \%hash ) or return;
291 ### parse the uri to usable parts ###
292 my $href = $class->_parse_uri( $uri ) or return;
294 $href->{file_default} = $file_default if $file_default;
295 $href->{tempdir_root} = File::Spec->rel2abs( $tempdir_root ) if $tempdir_root;
296 $href->{tempdir_root} = File::Spec->rel2abs( Cwd::cwd ) if not $href->{tempdir_root};
298 ### make it into a FFI object ###
299 my $ff = $class->_create( %$href ) or return;
302 ### return the object ###
306 ### parses an uri to a hash structure:
308 ### $class->_parse_uri( 'ftp://ftp.cpan.org/pub/mirror/index.txt' )
314 ### host => 'ftp.cpan.org',
315 ### path => '/pub/mirror',
316 ### file => 'index.html'
319 ### In the case of file:// urls there maybe be additional fields
321 ### For systems with volume specifications such as Win32 there will be
322 ### a volume specifier provided in the 'vol' field.
324 ### 'vol' => 'volumename'
326 ### For windows file shares there may be a 'share' key specified
328 ### 'share' => 'sharename'
330 ### Note that the rules of what a file:// url means vary by the operating system
331 ### of the host being addressed. Thus file:///d|/foo/bar.txt means the obvious
332 ### 'D:\foo\bar.txt' on windows, but on unix it means '/d|/foo/bar.txt' and
333 ### not '/foo/bar.txt'
335 ### Similarly if the host interpreting the url is VMS then
336 ### file:///disk$user/my/notes/note12345.txt' means
337 ### 'DISK$USER:[MY.NOTES]NOTE123456.TXT' but will be returned the same as
338 ### if it is unix where it means /disk$user/my/notes/note12345.txt'.
339 ### Except for some cases in the File::Spec methods, Perl on VMS will generally
340 ### handle UNIX format file specifications.
342 ### This means it is impossible to serve certain file:// urls on certain systems.
344 ### Thus are the problems with a protocol-less specification. :-(
349 my $uri = shift or return;
351 my $href = { uri => $uri };
353 ### find the scheme ###
354 $uri =~ s|^(\w+)://||;
355 $href->{scheme} = $1;
357 ### See rfc 1738 section 3.10
358 ### http://www.faqs.org/rfcs/rfc1738.html
359 ### And wikipedia for more on windows file:// urls
360 ### http://en.wikipedia.org/wiki/File://
361 if( $href->{scheme} eq 'file' ) {
363 my @parts = split '/',$uri;
365 ### file://hostname/...
366 ### file://hostname/...
367 ### normalize file://localhost with file:///
368 $href->{host} = $parts[0] || '';
370 ### index in @parts where the path components begin;
373 ### file:////hostname/sharename/blah.txt
374 if ( HAS_SHARE and not length $parts[0] and not length $parts[1] ) {
376 $href->{host} = $parts[2] || ''; # avoid warnings
377 $href->{share} = $parts[3] || ''; # avoid warnings
379 $index = 4 # index after the share
381 ### file:///D|/blah.txt
382 ### file:///D:/blah.txt
385 ### this code comes from dmq's patch, but:
386 ### XXX if volume is empty, wouldn't that be an error? --kane
387 ### if so, our file://localhost test needs to be fixed as wel
388 $href->{vol} = $parts[1] || '';
390 ### correct D| style colume descriptors
391 $href->{vol} =~ s/\A([A-Z])\|\z/$1:/i if ON_WIN;
393 $index = 2; # index after the volume
396 ### rebuild the path from the leftover parts;
397 $href->{path} = join '/', '', splice( @parts, $index, $#parts );
400 ### using anything but qw() in hash slices may produce warnings
401 ### in older perls :-(
402 @{$href}{ qw(host path) } = $uri =~ m|([^/]*)(/.*)$|s;
405 ### split the path into file + dir ###
406 { my @parts = File::Spec::Unix->splitpath( delete $href->{path} );
407 $href->{path} = $parts[1];
408 $href->{file} = $parts[2];
411 ### host will be empty if the target was 'localhost' and the
412 ### scheme was 'file'
413 $href->{host} = '' if ($href->{host} eq 'localhost') and
414 ($href->{scheme} eq 'file');
419 =head2 $where = $ff->fetch( [to => /my/output/dir/ | \$scalar] )
421 Fetches the file you requested and returns the full path to the file.
423 By default it writes to C<cwd()>, but you can override that by specifying
426 ### file fetch to /tmp, full path to the file in $where
427 $where = $ff->fetch( to => '/tmp' );
429 ### file slurped into $scalar, full path to the file in $where
430 ### file is downloaded to a temp directory and cleaned up at exit time
431 $where = $ff->fetch( to => \$scalar );
433 Returns the full path to the downloaded file on success, and false
439 my $self = shift or return;
444 to => { default => cwd(), store => \$target },
447 check( $tmpl, \%hash ) or return;
450 ### you want us to slurp the contents
451 if( ref $target and UNIVERSAL::isa( $target, 'SCALAR' ) ) {
452 $to = tempdir( 'FileFetch.XXXXXX', DIR => $self->tempdir_root, CLEANUP => 1 );
458 ### On VMS force to VMS format so File::Spec will work.
459 $to = VMS::Filespec::vmspath($to) if ON_VMS;
461 ### create the path if it doesn't exist yet ###
463 eval { mkpath( $to ) };
465 return $self->_error(loc("Could not create path '%1'",$to)) if $@;
469 ### set passive ftp if required ###
470 local $ENV{FTP_PASSIVE} = $FTP_PASSIVE;
472 ### we dont use catfile on win32 because if we are using a cygwin tool
473 ### under cmd.exe they wont understand windows style separators.
474 my $out_to = ON_WIN ? $to.'/'.$self->output_file
475 : File::Spec->catfile( $to, $self->output_file );
477 for my $method ( @{ $METHODS->{$self->scheme} } ) {
478 my $sub = '_'.$method.'_fetch';
480 unless( __PACKAGE__->can($sub) ) {
481 $self->_error(loc("Cannot call method for '%1' -- WEIRD!",
486 ### method is blacklisted ###
487 next if grep { lc $_ eq $method } @$BLACKLIST;
489 ### method is known to fail ###
490 next if $METHOD_FAIL->{$method};
492 ### there's serious issues with IPC::Run and quoting of command
493 ### line arguments. using quotes in the wrong place breaks things,
494 ### and in the case of say,
495 ### C:\cygwin\bin\wget.EXE --quiet --passive-ftp --output-document
496 ### "index.html" "http://www.cpan.org/index.html?q=1&y=2"
497 ### it doesn't matter how you quote, it always fails.
498 local $IPC::Cmd::USE_IPC_RUN = 0;
500 if( my $file = $self->$sub(
504 unless( -e $file && -s _ ) {
505 $self->_error(loc("'%1' said it fetched '%2', ".
506 "but it was not created",$method,$file));
508 ### mark the failure ###
509 $METHOD_FAIL->{$method} = 1;
516 if( ref $target and UNIVERSAL::isa( $target, 'SCALAR' ) ) {
519 open my $fh, "<$file" or do {
521 loc("Could not open '%1': %2", $file, $!));
526 $$target = do { local $/; <$fh> };
530 my $abs = File::Spec->rel2abs( $file );
538 ### if we got here, we looped over all methods, but we weren't able
543 ########################
544 ### _*_fetch methods ###
545 ########################
554 to => { required => 1, store => \$to }
556 check( $tmpl, \%hash ) or return;
558 ### modules required to download with lwp ###
561 'LWP::UserAgent' => '0.0',
562 'HTTP::Request' => '0.0',
563 'HTTP::Status' => '0.0',
568 unless( can_load( modules => $use_list ) ) {
569 $METHOD_FAIL->{'lwp'} = 1;
573 ### setup the uri object
574 my $uri = URI->new( File::Spec::Unix->catfile(
575 $self->path, $self->file
578 ### special rules apply for file:// uris ###
579 $uri->scheme( $self->scheme );
580 $uri->host( $self->scheme eq 'file' ? '' : $self->host );
581 $uri->userinfo("anonymous:$FROM_EMAIL") if $self->scheme ne 'file';
583 ### set up the useragent object
584 my $ua = LWP::UserAgent->new();
585 $ua->timeout( $TIMEOUT ) if $TIMEOUT;
586 $ua->agent( $USER_AGENT );
587 $ua->from( $FROM_EMAIL );
590 my $res = $ua->mirror($uri, $to) or return;
592 ### uptodate or fetched ok ###
593 if ( $res->code == 304 or $res->code == 200 ) {
597 return $self->_error(loc("Fetch failed! HTTP response: %1 %2 [%3]",
598 $res->code, HTTP::Status::status_message($res->code),
604 ### HTTP::Tiny fetching ###
605 sub _httptiny_fetch {
611 to => { required => 1, store => \$to }
613 check( $tmpl, \%hash ) or return;
616 'HTTP::Tiny' => '0.008',
620 unless( can_load(modules => $use_list) ) {
621 $METHOD_FAIL->{'httptiny'} = 1;
625 my $uri = $self->uri;
627 my $http = HTTP::Tiny->new( ( $TIMEOUT ? ( timeout => $TIMEOUT ) : () ) );
629 my $rc = $http->mirror( $uri, $to );
631 unless ( $rc->{success} ) {
633 return $self->_error(loc( "Fetch failed! HTTP response: %1 [%2]",
634 $rc->{status}, $rc->{reason} ) );
642 ### HTTP::Lite fetching ###
643 sub _httplite_fetch {
649 to => { required => 1, store => \$to }
651 check( $tmpl, \%hash ) or return;
653 ### modules required to download with lwp ###
655 'HTTP::Lite' => '2.2',
659 unless( can_load(modules => $use_list) ) {
660 $METHOD_FAIL->{'httplite'} = 1;
664 my $uri = $self->uri;
667 RETRIES: while ( $retries++ < 5 ) {
669 my $http = HTTP::Lite->new();
670 # Naughty naughty but there isn't any accessor/setter
671 $http->{timeout} = $TIMEOUT if $TIMEOUT;
672 $http->http11_mode(1);
674 my $fh = FileHandle->new;
676 unless ( $fh->open($to,'>') ) {
677 return $self->_error(loc(
678 "Could not open '%1' for writing: %2",$to,$!));
685 my $rc = $http->request( $uri, sub { my ($self,$dref,$cbargs) = @_; local $\; print {$cbargs} $$dref }, $fh );
689 if ( $rc == 301 || $rc == 302 ) {
691 HEADERS: for ($http->headers_array) {
692 /Location: (\S+)/ and $loc = $1, last HEADERS;
694 #$loc or last; # Think we should squeal here.
696 $uri =~ s{^(\w+?://[^/]+)/.*$}{$1};
704 elsif ( $rc == 200 ) {
708 return $self->_error(loc("Fetch failed! HTTP response: %1 [%2]",
709 $rc, $http->status_message));
712 } # Loop for 5 retries.
714 return $self->_error("Fetch failed! Gave up after 5 tries");
718 ### Simple IO::Socket::INET fetching ###
725 to => { required => 1, store => \$to }
727 check( $tmpl, \%hash ) or return;
730 'IO::Socket::INET' => '0.0',
731 'IO::Select' => '0.0',
734 unless( can_load(modules => $use_list) ) {
735 $METHOD_FAIL->{'iosock'} = 1;
739 my $sock = IO::Socket::INET->new(
740 PeerHost => $self->host,
741 ( $self->host =~ /:/ ? () : ( PeerPort => 80 ) ),
745 return $self->_error(loc("Could not open socket to '%1', '%2'",$self->host,$!));
748 my $fh = FileHandle->new;
752 unless ( $fh->open($to,'>') ) {
753 return $self->_error(loc(
754 "Could not open '%1' for writing: %2",$to,$!));
760 my $path = File::Spec::Unix->catfile( $self->path, $self->file );
761 my $req = "GET $path HTTP/1.0\x0d\x0aHost: " . $self->host . "\x0d\x0a\x0d\x0a";
764 my $select = IO::Select->new( $sock );
768 while ( $select->can_read( $TIMEOUT || 60 ) ) {
769 my $ret = $sock->sysread( $resp, 4096, length($resp) );
770 if ( !defined $ret or $ret == 0 ) {
771 $select->remove( $sock );
778 return $self->_error(loc("Socket timed out after '%1' seconds", ( $TIMEOUT || 60 )));
781 # Check the "response"
782 # Strip preceding blank lines apparently they are allowed (RFC 2616 4.1)
783 $resp =~ s/^(\x0d?\x0a)+//;
784 # Check it is an HTTP response
785 unless ( $resp =~ m!^HTTP/(\d+)\.(\d+)!i ) {
786 return $self->_error(loc("Did not get a HTTP response from '%1'",$self->host));
790 my ($code) = $resp =~ m!^HTTP/\d+\.\d+\s+(\d+)!i;
791 unless ( $code eq '200' ) {
792 return $self->_error(loc("Got a '%1' from '%2' expected '200'",$code,$self->host));
797 print $fh +($resp =~ m/\x0d\x0a\x0d\x0a(.*)$/s )[0];
803 ### Net::FTP fetching
810 to => { required => 1, store => \$to }
812 check( $tmpl, \%hash ) or return;
814 ### required modules ###
815 my $use_list = { 'Net::FTP' => 0 };
817 unless( can_load( modules => $use_list ) ) {
818 $METHOD_FAIL->{'netftp'} = 1;
822 ### make connection ###
824 my @options = ($self->host);
825 push(@options, Timeout => $TIMEOUT) if $TIMEOUT;
826 unless( $ftp = Net::FTP->new( @options ) ) {
827 return $self->_error(loc("Ftp creation failed: %1",$@));
831 unless( $ftp->login( anonymous => $FROM_EMAIL ) ) {
832 return $self->_error(loc("Could not login to '%1'",$self->host));
835 ### set binary mode, just in case ###
838 ### create the remote path
839 ### remember remote paths are unix paths! [#11483]
840 my $remote = File::Spec::Unix->catfile( $self->path, $self->file );
842 ### fetch the file ###
844 unless( $target = $ftp->get( $remote, $to ) ) {
845 return $self->_error(loc("Could not fetch '%1' from '%2'",
846 $remote, $self->host));
856 ### /bin/wget fetch ###
863 to => { required => 1, store => \$to }
865 check( $tmpl, \%hash ) or return;
868 ### see if we have a wget binary ###
869 unless( $wget = can_run('wget') ) {
870 $METHOD_FAIL->{'wget'} = 1;
874 ### no verboseness, thanks ###
875 my $cmd = [ $wget, '--quiet' ];
877 ### if a timeout is set, add it ###
878 push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT;
880 ### run passive if specified ###
881 push @$cmd, '--passive-ftp' if $FTP_PASSIVE;
883 ### set the output document, add the uri ###
884 push @$cmd, '--output-document', $to, $self->uri;
886 ### with IPC::Cmd > 0.41, this is fixed in teh library,
887 ### and there's no need for special casing any more.
888 ### DO NOT quote things for IPC::Run, it breaks stuff.
889 # $IPC::Cmd::USE_IPC_RUN
890 # ? ($to, $self->uri)
891 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
895 unless(run( command => $cmd,
896 buffer => \$captured,
899 ### wget creates the output document always, even if the fetch
900 ### fails.. so unlink it in that case
903 return $self->_error(loc( "Command failed: %1", $captured || '' ));
909 ### /bin/lftp fetch ###
916 to => { required => 1, store => \$to }
918 check( $tmpl, \%hash ) or return;
920 ### see if we have a lftp binary ###
922 unless( $lftp = can_run('lftp') ) {
923 $METHOD_FAIL->{'lftp'} = 1;
927 ### no verboseness, thanks ###
928 my $cmd = [ $lftp, '-f' ];
930 my $fh = File::Temp->new;
934 ### if a timeout is set, add it ###
935 $str .= "set net:timeout $TIMEOUT;\n" if $TIMEOUT;
937 ### run passive if specified ###
938 $str .= "set ftp:passive-mode 1;\n" if $FTP_PASSIVE;
940 ### set the output document, add the uri ###
941 ### quote the URI, because lftp supports certain shell
942 ### expansions, most notably & for backgrounding.
943 ### ' quote does nto work, must be "
944 $str .= q[get ']. $self->uri .q[' -o ]. $to . $/;
947 my $pp_str = join ' ', split $/, $str;
948 print "# lftp command: $pp_str\n";
951 ### write straight to the file.
955 ### the command needs to be 1 string to be executed
956 push @$cmd, $fh->filename;
958 ### with IPC::Cmd > 0.41, this is fixed in teh library,
959 ### and there's no need for special casing any more.
960 ### DO NOT quote things for IPC::Run, it breaks stuff.
961 # $IPC::Cmd::USE_IPC_RUN
962 # ? ($to, $self->uri)
963 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
968 unless(run( command => $cmd,
969 buffer => \$captured,
972 ### wget creates the output document always, even if the fetch
973 ### fails.. so unlink it in that case
976 return $self->_error(loc( "Command failed: %1", $captured || '' ));
984 ### /bin/ftp fetch ###
991 to => { required => 1, store => \$to }
993 check( $tmpl, \%hash ) or return;
995 ### see if we have a ftp binary ###
997 unless( $ftp = can_run('ftp') ) {
998 $METHOD_FAIL->{'ftp'} = 1;
1002 my $fh = FileHandle->new;
1004 local $SIG{CHLD} = 'IGNORE';
1006 unless ($fh->open("$ftp -n", '|-')) {
1007 return $self->_error(loc("%1 creation failed: %2", $ftp, $!));
1011 "lcd " . dirname($to),
1012 "open " . $self->host,
1013 "user anonymous $FROM_EMAIL",
1015 "cd " . $self->path,
1017 "get " . $self->file . " " . $self->output_file,
1021 foreach (@dialog) { $fh->print($_, "\n") }
1022 $fh->close or return;
1027 ### lynx is stupid - it decompresses any .gz file it finds to be text
1028 ### use /bin/lynx to fetch files
1035 to => { required => 1, store => \$to }
1037 check( $tmpl, \%hash ) or return;
1039 ### see if we have a lynx binary ###
1041 unless ( $lynx = can_run('lynx') ){
1042 $METHOD_FAIL->{'lynx'} = 1;
1046 unless( IPC::Cmd->can_capture_buffer ) {
1047 $METHOD_FAIL->{'lynx'} = 1;
1049 return $self->_error(loc(
1050 "Can not capture buffers. Can not use '%1' to fetch files",
1054 ### check if the HTTP resource exists ###
1055 if ($self->uri =~ /^https?:\/\//i) {
1060 "-auth=anonymous:$FROM_EMAIL",
1063 push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT;
1065 push @$cmd, $self->uri;
1069 unless(run( command => $cmd,
1073 return $self->_error(loc("Command failed: %1", $head || ''));
1076 unless($head =~ /^HTTP\/\d+\.\d+ 200\b/) {
1077 return $self->_error(loc("Command failed: %1", $head || ''));
1081 ### write to the output file ourselves, since lynx ass_u_mes to much
1082 my $local = FileHandle->new( $to, 'w' )
1083 or return $self->_error(loc(
1084 "Could not open '%1' for writing: %2",$to,$!));
1086 ### dump to stdout ###
1090 "-auth=anonymous:$FROM_EMAIL",
1093 push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT;
1095 ### DO NOT quote things for IPC::Run, it breaks stuff.
1096 push @$cmd, $self->uri;
1098 ### with IPC::Cmd > 0.41, this is fixed in teh library,
1099 ### and there's no need for special casing any more.
1100 ### DO NOT quote things for IPC::Run, it breaks stuff.
1101 # $IPC::Cmd::USE_IPC_RUN
1103 # : QUOTE. $self->uri .QUOTE;
1108 unless(run( command => $cmd,
1109 buffer => \$captured,
1112 return $self->_error(loc("Command failed: %1", $captured || ''));
1115 ### print to local file ###
1116 ### XXX on a 404 with a special error page, $captured will actually
1117 ### hold the contents of that page, and make it *appear* like the
1118 ### request was a success, when really it wasn't :(
1119 ### there doesn't seem to be an option for lynx to change the exit
1120 ### code based on a 4XX status or so.
1121 ### the closest we can come is using --error_file and parsing that,
1122 ### which is very unreliable ;(
1123 $local->print( $captured );
1124 $local->close or return;
1129 ### use /bin/ncftp to fetch files
1136 to => { required => 1, store => \$to }
1138 check( $tmpl, \%hash ) or return;
1140 ### we can only set passive mode in interactive sessions, so bail out
1141 ### if $FTP_PASSIVE is set
1142 return if $FTP_PASSIVE;
1144 ### see if we have a ncftp binary ###
1146 unless( $ncftp = can_run('ncftp') ) {
1147 $METHOD_FAIL->{'ncftp'} = 1;
1153 '-V', # do not be verbose
1154 '-p', $FROM_EMAIL, # email as password
1155 $self->host, # hostname
1156 dirname($to), # local dir for the file
1157 # remote path to the file
1158 ### DO NOT quote things for IPC::Run, it breaks stuff.
1159 $IPC::Cmd::USE_IPC_RUN
1160 ? File::Spec::Unix->catdir( $self->path, $self->file )
1161 : QUOTE. File::Spec::Unix->catdir(
1162 $self->path, $self->file ) .QUOTE
1168 unless(run( command => $cmd,
1169 buffer => \$captured,
1172 return $self->_error(loc("Command failed: %1", $captured || ''));
1179 ### use /bin/curl to fetch files
1186 to => { required => 1, store => \$to }
1188 check( $tmpl, \%hash ) or return;
1190 unless ( $curl = can_run('curl') ) {
1191 $METHOD_FAIL->{'curl'} = 1;
1195 ### these long opts are self explanatory - I like that -jmb
1196 my $cmd = [ $curl, '-q' ];
1198 push(@$cmd, '--connect-timeout', $TIMEOUT) if $TIMEOUT;
1200 push(@$cmd, '--silent') unless $DEBUG;
1202 ### curl does the right thing with passive, regardless ###
1203 if ($self->scheme eq 'ftp') {
1204 push(@$cmd, '--user', "anonymous:$FROM_EMAIL");
1207 ### curl doesn't follow 302 (temporarily moved) etc automatically
1208 ### so we add --location to enable that.
1209 push @$cmd, '--fail', '--location', '--output', $to, $self->uri;
1211 ### with IPC::Cmd > 0.41, this is fixed in teh library,
1212 ### and there's no need for special casing any more.
1213 ### DO NOT quote things for IPC::Run, it breaks stuff.
1214 # $IPC::Cmd::USE_IPC_RUN
1215 # ? ($to, $self->uri)
1216 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
1220 unless(run( command => $cmd,
1221 buffer => \$captured,
1225 return $self->_error(loc("Command failed: %1", $captured || ''));
1232 ### /usr/bin/fetch fetch! ###
1239 to => { required => 1, store => \$to }
1241 check( $tmpl, \%hash ) or return;
1243 ### see if we have a fetch binary ###
1245 unless( HAS_FETCH and $fetch = can_run('fetch') ) {
1246 $METHOD_FAIL->{'fetch'} = 1;
1250 ### no verboseness, thanks ###
1251 my $cmd = [ $fetch, '-q' ];
1253 ### if a timeout is set, add it ###
1254 push(@$cmd, '-T', $TIMEOUT) if $TIMEOUT;
1256 ### run passive if specified ###
1257 #push @$cmd, '-p' if $FTP_PASSIVE;
1258 local $ENV{'FTP_PASSIVE_MODE'} = 1 if $FTP_PASSIVE;
1260 ### set the output document, add the uri ###
1261 push @$cmd, '-o', $to, $self->uri;
1263 ### with IPC::Cmd > 0.41, this is fixed in teh library,
1264 ### and there's no need for special casing any more.
1265 ### DO NOT quote things for IPC::Run, it breaks stuff.
1266 # $IPC::Cmd::USE_IPC_RUN
1267 # ? ($to, $self->uri)
1268 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
1272 unless(run( command => $cmd,
1273 buffer => \$captured,
1276 ### wget creates the output document always, even if the fetch
1277 ### fails.. so unlink it in that case
1280 return $self->_error(loc( "Command failed: %1", $captured || '' ));
1286 ### use File::Copy for fetching file:// urls ###
1288 ### See section 3.10 of RFC 1738 (http://www.faqs.org/rfcs/rfc1738.html)
1289 ### Also see wikipedia on file:// (http://en.wikipedia.org/wiki/File://)
1298 to => { required => 1, store => \$to }
1300 check( $tmpl, \%hash ) or return;
1304 ### prefix a / on unix systems with a file uri, since it would
1305 ### look somewhat like this:
1306 ### file:///home/kane/file
1307 ### whereas windows file uris for 'c:\some\dir\file' might look like:
1308 ### file:///C:/some/dir/file
1309 ### file:///C|/some/dir/file
1310 ### or for a network share '\\host\share\some\dir\file':
1311 ### file:////host/share/some/dir/file
1313 ### VMS file uri's for 'DISK$USER:[MY.NOTES]NOTE123456.TXT' might look like:
1314 ### file://vms.host.edu/disk$user/my/notes/note12345.txt
1317 my $path = $self->path;
1318 my $vol = $self->vol;
1319 my $share = $self->share;
1322 if (!$share and $self->host) {
1323 return $self->_error(loc(
1324 "Currently %1 cannot handle hosts in %2 urls",
1325 'File::Fetch', 'file://'
1330 $path = File::Spec->catdir( split /\//, $path );
1331 $remote = File::Spec->catpath( $vol, $path, $self->file);
1334 ### win32 specific, and a share name, so we wont bother with File::Spec
1336 $remote = "\\\\".$self->host."\\$share\\$path";
1339 ### File::Spec on VMS can not currently handle UNIX syntax.
1340 my $file_class = ON_VMS
1341 ? 'File::Spec::Unix'
1344 $remote = $file_class->catfile( $path, $self->file );
1347 ### File::Copy is littered with 'die' statements :( ###
1348 my $rv = eval { File::Copy::copy( $remote, $to ) };
1350 ### something went wrong ###
1352 return $self->_error(loc("Could not copy '%1' to '%2': %3 %4",
1353 $remote, $to, $!, $@));
1359 ### use /usr/bin/rsync to fetch files
1366 to => { required => 1, store => \$to }
1368 check( $tmpl, \%hash ) or return;
1370 unless ( $rsync = can_run('rsync') ) {
1371 $METHOD_FAIL->{'rsync'} = 1;
1375 my $cmd = [ $rsync ];
1377 ### XXX: rsync has no I/O timeouts at all, by default
1378 push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT;
1380 push(@$cmd, '--quiet') unless $DEBUG;
1382 ### DO NOT quote things for IPC::Run, it breaks stuff.
1383 push @$cmd, $self->uri, $to;
1385 ### with IPC::Cmd > 0.41, this is fixed in teh library,
1386 ### and there's no need for special casing any more.
1387 ### DO NOT quote things for IPC::Run, it breaks stuff.
1388 # $IPC::Cmd::USE_IPC_RUN
1389 # ? ($to, $self->uri)
1390 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
1393 unless(run( command => $cmd,
1394 buffer => \$captured,
1398 return $self->_error(loc("Command %1 failed: %2",
1399 "@$cmd" || '', $captured || ''));
1406 ### use git to fetch files
1413 to => { required => 1, store => \$to }
1415 check( $tmpl, \%hash ) or return;
1417 unless ( $git = can_run('git') ) {
1418 $METHOD_FAIL->{'git'} = 1;
1422 my $cmd = [ $git, 'clone' ];
1424 #push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT;
1426 push(@$cmd, '--quiet') unless $DEBUG;
1428 ### DO NOT quote things for IPC::Run, it breaks stuff.
1429 push @$cmd, $self->uri, $to;
1431 ### with IPC::Cmd > 0.41, this is fixed in teh library,
1432 ### and there's no need for special casing any more.
1433 ### DO NOT quote things for IPC::Run, it breaks stuff.
1434 # $IPC::Cmd::USE_IPC_RUN
1435 # ? ($to, $self->uri)
1436 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
1439 unless(run( command => $cmd,
1440 buffer => \$captured,
1444 return $self->_error(loc("Command %1 failed: %2",
1445 "@$cmd" || '', $captured || ''));
1452 #################################
1456 #################################
1460 =head2 $ff->error([BOOL])
1462 Returns the last encountered error as string.
1463 Pass it a true value to get the C<Carp::longmess()> output instead.
1467 ### error handling the way Archive::Extract does it
1472 $self->_error_msg( $error );
1473 $self->_error_msg_long( Carp::longmess($error) );
1476 carp $DEBUG ? $self->_error_msg_long : $self->_error_msg;
1484 return shift() ? $self->_error_msg_long : $self->_error_msg;
1494 File::Fetch is able to fetch a variety of uris, by using several
1495 external programs and modules.
1497 Below is a mapping of what utilities will be used in what order
1498 for what schemes, if available:
1500 file => LWP, lftp, file
1501 http => LWP, HTTP::Lite, wget, curl, lftp, fetch, lynx, iosock
1502 ftp => LWP, Net::FTP, wget, curl, lftp, fetch, ncftp, ftp
1506 If you'd like to disable the use of one or more of these utilities
1507 and/or modules, see the C<$BLACKLIST> variable further down.
1509 If a utility or module isn't available, it will be marked in a cache
1510 (see the C<$METHOD_FAIL> variable further down), so it will not be
1511 tried again. The C<fetch> method will only fail when all options are
1512 exhausted, and it was not able to retrieve the file.
1514 The C<fetch> utility is available on FreeBSD. NetBSD and Dragonfly BSD
1515 may also have it from C<pkgsrc>. We only check for C<fetch> on those
1518 C<iosock> is a very limited L<IO::Socket::INET> based mechanism for
1519 retrieving C<http> schemed urls. It doesn't follow redirects for instance.
1521 C<git> only supports C<git://> style urls.
1523 A special note about fetching files from an ftp uri:
1525 By default, all ftp connections are done in passive mode. To change
1526 that, see the C<$FTP_PASSIVE> variable further down.
1528 Furthermore, ftp uris only support anonymous connections, so no
1529 named user/password pair can be passed along.
1531 C</bin/ftp> is blacklisted by default; see the C<$BLACKLIST> variable
1534 =head1 GLOBAL VARIABLES
1536 The behaviour of File::Fetch can be altered by changing the following
1539 =head2 $File::Fetch::FROM_EMAIL
1541 This is the email address that will be sent as your anonymous ftp
1544 Default is C<File-Fetch@example.com>.
1546 =head2 $File::Fetch::USER_AGENT
1548 This is the useragent as C<LWP> will report it.
1550 Default is C<File::Fetch/$VERSION>.
1552 =head2 $File::Fetch::FTP_PASSIVE
1554 This variable controls whether the environment variable C<FTP_PASSIVE>
1555 and any passive switches to commandline tools will be set to true.
1559 Note: When $FTP_PASSIVE is true, C<ncftp> will not be used to fetch
1560 files, since passive mode can only be set interactively for this binary
1562 =head2 $File::Fetch::TIMEOUT
1564 When set, controls the network timeout (counted in seconds).
1568 =head2 $File::Fetch::WARN
1570 This variable controls whether errors encountered internally by
1571 C<File::Fetch> should be C<carp>'d or not.
1573 Set to false to silence warnings. Inspect the output of the C<error()>
1574 method manually to see what went wrong.
1576 Defaults to C<true>.
1578 =head2 $File::Fetch::DEBUG
1580 This enables debugging output when calling commandline utilities to
1582 This also enables C<Carp::longmess> errors, instead of the regular
1585 Good for tracking down why things don't work with your particular
1590 =head2 $File::Fetch::BLACKLIST
1592 This is an array ref holding blacklisted modules/utilities for fetching
1595 To disallow the use of, for example, C<LWP> and C<Net::FTP>, you could
1596 set $File::Fetch::BLACKLIST to:
1598 $File::Fetch::BLACKLIST = [qw|lwp netftp|]
1600 The default blacklist is [qw|ftp|], as C</bin/ftp> is rather unreliable.
1602 See the note on C<MAPPING> below.
1604 =head2 $File::Fetch::METHOD_FAIL
1606 This is a hashref registering what modules/utilities were known to fail
1607 for fetching files (mostly because they weren't installed).
1609 You can reset this cache by assigning an empty hashref to it, or
1610 individually remove keys.
1612 See the note on C<MAPPING> below.
1617 Here's a quick mapping for the utilities/modules, and their names for
1618 the $BLACKLIST, $METHOD_FAIL and other internal functions.
1621 HTTP::Lite => httplite
1622 HTTP::Tiny => httptiny
1632 IO::Socket => iosock
1634 =head1 FREQUENTLY ASKED QUESTIONS
1636 =head2 So how do I use a proxy with File::Fetch?
1638 C<File::Fetch> currently only supports proxies with LWP::UserAgent.
1639 You will need to set your environment variables accordingly. For
1640 example, to use an ftp proxy:
1642 $ENV{ftp_proxy} = 'foo.com';
1644 Refer to the LWP::UserAgent manpage for more details.
1646 =head2 I used 'lynx' to fetch a file, but its contents is all wrong!
1648 C<lynx> can only fetch remote files by dumping its contents to C<STDOUT>,
1649 which we in turn capture. If that content is a 'custom' error file
1650 (like, say, a C<404 handler>), you will get that contents instead.
1652 Sadly, C<lynx> doesn't support any options to return a different exit
1653 code on non-C<200 OK> status, giving us no way to tell the difference
1654 between a 'successful' fetch and a custom error page.
1656 Therefor, we recommend to only use C<lynx> as a last resort. This is
1657 why it is at the back of our list of methods to try as well.
1659 =head2 Files I'm trying to fetch have reserved characters or non-ASCII characters in them. What do I do?
1661 C<File::Fetch> is relatively smart about things. When trying to write
1662 a file to disk, it removes the C<query parameters> (see the
1663 C<output_file> method for details) from the file name before creating
1664 it. In most cases this suffices.
1666 If you have any other characters you need to escape, please install
1667 the C<URI::Escape> module from CPAN, and pre-encode your URI before
1668 passing it to C<File::Fetch>. You can read about the details of URIs
1669 and URI encoding here:
1671 http://www.faqs.org/rfcs/rfc2396.html
1677 =item Implement $PREFER_BIN
1679 To indicate to rather use commandline tools than modules
1685 Please report bugs or other issues to E<lt>bug-file-fetch@rt.cpan.org<gt>.
1689 This module by Jos Boumans E<lt>kane@cpan.orgE<gt>.
1693 This library is free software; you may redistribute and/or modify it
1694 under the same terms as Perl itself.
1700 # c-indentation-style: bsd
1702 # indent-tabs-mode: nil
1704 # vim: expandtab shiftwidth=4: