This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Update to check all URLs in the source, including ftp and ignore known example URLs
[perl5.git] / Porting / checkURL.pl
CommitLineData
23c4d79e 1#!perl
6e7dc4a9 2use strict;
23c4d79e
LB
3use warnings;
4use autodie;
5use feature qw(say);
6use File::Find::Rule;
7use File::Slurp;
8use File::Spec;
9use IO::Socket::SSL;
10use List::Util qw(sum);
11use LWP::UserAgent;
12use Net::FTP;
13use Parallel::Fork::BossWorkerAsync;
14use Term::ProgressBar::Simple;
15use URI::Find::Simple qw( list_uris );
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20 chomp $line;
21 next if $line =~ /^#/;
22 next unless $line;
23 $ignore{$line} = 1;
24}
6e7dc4a9 25
23c4d79e
LB
26my $ua = LWP::UserAgent->new;
27$ua->timeout(58);
28$ua->env_proxy;
6e7dc4a9 29
23c4d79e
LB
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32 unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
39 }
40);
6e7dc4a9 41
23c4d79e
LB
42my %uris;
43foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
47 my $contents = read_file($filename);
48 my @uris = list_uris($contents);
49 foreach my $uri (@uris) {
50 next unless $uri =~ /^(http|ftp)/;
51 next if $ignore{$uri};
6e7dc4a9 52
23c4d79e
LB
53 # no need to hit rt.perl.org
54 next
55 if $uri =~ m{^http://rt.perl.org/rt3/Ticket/Display.html?id=\d+$};
6e7dc4a9 56
23c4d79e
LB
57 # no need to hit rt.cpan.org
58 next
59 if $uri =~ m{^http://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
60 push @{ $uris{$uri} }, $filename;
61 }
62 $extract_progress += -s $filename;
63}
64
65my $bw = Parallel::Fork::BossWorkerAsync->new(
66 work_handler => \&work_alarmed,
67 global_timeout => 120,
68 worker_count => 20,
69);
70
71foreach my $uri ( keys %uris ) {
72 my @filenames = @{ $uris{$uri} };
73 $bw->add_work( { uri => $uri, filenames => \@filenames } );
74}
75
76undef $extract_progress;
77
78my $fetch_progress = Term::ProgressBar::Simple->new(
79 { count => scalar( keys %uris ),
80 name => 'Fetching URIs',
81 }
82);
83
84my %filenames;
85while ( $bw->pending() ) {
86 my $response = $bw->get_result();
87 my $uri = $response->{uri};
88 my @filenames = @{ $response->{filenames} };
89 my $is_success = $response->{is_success};
90 my $message = $response->{message};
91
92 unless ($is_success) {
93 foreach my $filename (@filenames) {
94 push @{ $filenames{$filename} },
95 { uri => $uri, message => $message };
6e7dc4a9
JH
96 }
97 }
23c4d79e 98 $fetch_progress++;
6e7dc4a9 99}
23c4d79e 100$bw->shut_down();
6e7dc4a9 101
23c4d79e
LB
102my $fh = IO::File->new('> uris.txt');
103foreach my $filename ( sort keys %filenames ) {
104 $fh->say("* $filename");
105 my @bits = @{ $filenames{$filename} };
106 foreach my $bit (@bits) {
107 my $uri = $bit->{uri};
108 my $message = $bit->{message};
109 $fh->say(" $uri");
110 $fh->say(" $message");
0d6d7233
JH
111 }
112}
23c4d79e 113$fh->close;
0d6d7233 114
23c4d79e 115say 'Finished, see uris.txt';
6e7dc4a9 116
23c4d79e
LB
117sub work_alarmed {
118 my $conf = shift;
119 eval {
120 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
121 alarm 60;
122 $conf = work($conf);
123 alarm 0;
124 };
125 if ($@) {
126 $conf->{is_success} = 0;
127 $conf->{message} = 'Timed out';
0d6d7233 128
23c4d79e
LB
129 }
130 return $conf;
0d6d7233
JH
131}
132
23c4d79e
LB
133sub work {
134 my $conf = shift;
135 my $uri = $conf->{uri};
136 my @filenames = @{ $conf->{filenames} };
0d6d7233 137
23c4d79e
LB
138 if ( $uri =~ /^http/ ) {
139 my $uri_without_fragment = URI->new($uri);
140 my $fragment = $uri_without_fragment->fragment(undef);
141 my $response = $ua->head($uri_without_fragment);
0d6d7233 142
23c4d79e
LB
143 $conf->{is_success} = $response->is_success;
144 $conf->{message} = $response->status_line;
145 return $conf;
146 } else {
0d6d7233 147
23c4d79e
LB
148 my $uri_object = URI->new($uri);
149 my $host = $uri_object->host;
150 my $path = $uri_object->path;
151 my ( $volume, $directories, $filename )
152 = File::Spec->splitpath($path);
0d6d7233 153
23c4d79e
LB
154 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
155 unless ($ftp) {
156 $conf->{is_succcess} = 0;
157 $conf->{message} = "Can not connect to $host: $@";
158 return $conf;
159 }
0d6d7233 160
23c4d79e
LB
161 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
162 unless ($can_login) {
163 $conf->{is_success} = 0;
164 $conf->{message} = "Can not login ", $ftp->message;
165 return $conf;
6e7dc4a9
JH
166 }
167
23c4d79e
LB
168 my $can_binary = $ftp->binary();
169 unless ($can_binary) {
170 $conf->{is_success} = 0;
171 $conf->{message} = "Can not binary ", $ftp->message;
172 return $conf;
173 }
174
175 my $can_cwd = $ftp->cwd($directories);
176 unless ($can_cwd) {
177 $conf->{is_success} = 0;
178 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
179 return $conf;
180 }
181
182 if ($filename) {
183 my $can_size = $ftp->size($filename);
184 unless ($can_size) {
185 $conf->{is_success} = 0;
186 $conf->{message}
187 = "Can not size $filename in $directories",
188 $ftp->message;
189 return $conf;
190 }
191 } else {
192 my ($can_dir) = $ftp->dir;
193 unless ($can_dir) {
194 my ($can_ls) = $ftp->ls;
195 unless ($can_ls) {
196 $conf->{is_success} = 0;
197 $conf->{message}
198 = "Can not dir or ls in $directories ",
199 $ftp->message;
200 return $conf;
201 }
202 }
203 }
204
205 $conf->{is_success} = 1;
206 return $conf;
6e7dc4a9
JH
207 }
208}
209
23c4d79e
LB
210__DATA__
211# these are fine but give errors
212ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
213ftp://ftp.stratus.com/pub/vos/utility/utility.html
214
215# this is missing, sigh
216ftp://ftp.sco.com/SLS/ptf7051e.Z
217http://perlmonks.thepen.com/42898.html
218
219# this are URI extraction bugs
220http://www.perl.org/E
221http://en.wikipedia.org/wiki/SREC_(file_format
222http://somewhere.else',-type=/
223ftp:passive-mode
224ftp:
225http:[-
226http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
227http://www.xray.mpe.mpg.de/mailing-lists/perl5-
228http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
229
230# these are used as an example
231http://example.com/
232http://something.here/
233http://users.perl5.git.perl.org/~yourlogin/
234http://github.com/USERNAME/perl/tree/orange
235http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
236http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
237http://somewhere.else$/
238http://somewhere.else$/
239http://somewhere.else/bin/foo&bar',-Type=
240http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
241http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
242http://www.perl.org/test.cgi
243http://cpan2.local/
244http://search.cpan.org/perldoc?
245http://cpan1.local/
246http://cpan.dev.local/CPAN
247http:///
248ftp://
249ftp://myurl/
250ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
251http://www14.software.ibm.com/webapp/download/downloadaz.jsp
252http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
253http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
254http://localhost/tmp/index.txt
255http://example.com/foo/bar.html
256http://example.com/Text-Bastardize-1.06.tar.gz
257ftp://example.com/sources/packages.txt
258http://example.com/sources/packages.txt
259http://example.com/sources
260ftp://example.com/sources
261http://some.where.com/dir/file.txt
262http://some.where.com/dir/a.txt
263http://foo.com/X.tgz
264ftp://foo.com/X.tgz
265http://foo/
266http://www.foo.com:8000/
267http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
268http://decoded/mirror/path
269http://a/b/c/d/e/f/g/h/i/j
270http://foo/bar.gz
271ftp://ftp.perl.org
272http://purl.org/rss/1.0/modules/taxonomy/
273ftp://ftp.sun.ac.za/CPAN/CPAN/
274ftp://ftp.cpan.org/pub/mirror/index.txt
275ftp://cpan.org/pub/mirror/index.txt
276http://example.com/~eh/
277http://plagger.org/.../rss
278http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
279http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
280http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
281http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
282http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
283http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
284http://module-build.sourceforge.net/META-spec-new.html
285http://module-build.sourceforge.net/META-spec-v1.4.html
286http://www.cs.vu.nl/~tmgil/vi.html
287http://perlcomposer.sourceforge.net/vperl.html
288http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
289http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
290http://world.std.com/~aep/ptkdb/
291http://www.castlelink.co.uk/object_system/
292http://www.fh-wedel.de/elvis/
293ftp://ftp.blarg.net/users/amol/zsh/
294ftp://ftp.funet.fi/pub/languages/perl/CPAN
295http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
296
297# these are used to generate or match URLs
298http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
299http://www.cpantesters.org/show/%s.yaml
300ftp://(.*?)/(.*)/(.*
301ftp://(.*?)/(.*)/(.*
302ftp://(.*?)/(.*)/(.*
303ftp://ftp.foo.bar/
304http://$host/
305http://wwwe%3C46/
306ftp:/
307
308# weird redirects that LWP doesn't like
309http://www.theperlreview.com/community_calendar
310http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
311http://groups.google.com/
312http://groups.google.com/group/comp.lang.perl.misc/topics
313http://groups.google.co.uk/group/perl.perl5.porters/browse_thread/thread/b4e2b1d88280ddff/630b667a66e3509f?#630b667a66e3509f
314http://groups.google.com/group/comp.sys.sgi.admin/msg/3ad8353bc4ce3cb0
315http://groups.google.com/group/perl.module.build/browse_thread/thread/c8065052b2e0d741
316http://groups.google.com/group/perl.perl5.porters/browse_thread/thread/adaffaaf939b042e/20dafc298df737f0%2320dafc298df737f0?sa=X&oi=groupsr&start=0&num=3
317
318# broken webserver that doesn't like HEAD requests
319http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
320
321# these have been reported upstream to CPAN authors
322http://www.gnu.org/manual/tar/html_node/tar_139.html
323http://www.w3.org/pub/WWW/TR/Wd-css-1.html
324http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
325http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
326http://search.cpan.org/search?query=Module::Build::Convert
327http://www.refcnt.org/papers/module-build-convert
328http://csrc.nist.gov/cryptval/shs.html
329http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
330http://www.debian.or.jp/~kubota/unicode-symbols.html.en
331http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
332http://www.debian.or.jp/~kubota/unicode-symbols.html.en
333http://rfc.net/rfc2781.html
334http://www.icu-project.org/charset/
335http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
336http://www.rfc-editor.org/
337http://www.rfc.net/
338http://www.oreilly.com/people/authors/lunde/cjk_inf.html
339http://www.oreilly.com/catalog/cjkvinfo/
340http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
341http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
342http://www.egt.ie/standards/iso3166/iso3166-1-en.html
343http://www.bsi-global.com/iso4217currency
344http://www.plover.com/~mjd/perl/Memoize/
345http://www.plover.com/~mjd/perl/MiniMemoize/
346http://www.sysadminmag.com/tpj/issues/vol5_5/
347ftp://ftp.tpc.int/tpc/server/UNIX/
348http://www.nara.gov/genealogy/
349http://home.utah-inter.net/kinsearch/Soundex.html
350http://www.nara.gov/genealogy/soundex/soundex.html
351http://rfc.net/rfc3461.html
352ftp://ftp.cs.pdx.edu/pub/elvis/
353http://www.fh-wedel.de/elvis/
354
6e7dc4a9 355__END__
23c4d79e
LB
356
357=head1 NAME
358
359checkURL.pl - Check that all the URLs in the Perl source are valid
360
361=head1 DESCRIPTION
362
363This program checks that all the URLs in the Perl source are valid. It
364checks HTTP and FTP links in parallel and contains a list of known
365bad example links in its source. It takes 4 minutes to run on my
366machine. The results are written to 'uris.txt' and list the filename,
367the URL and the error:
368
369 * ext/Locale-Maketext/lib/Locale/Maketext.pod
370 http://sunsite.dk/RFC/rfc/rfc2277.html
371 404 Not Found
372 ...
373
374It should be run every so often and links fixed and upstream authors
375notified.
376
377Note that the web is unstable and some websites are temporarily down.