This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Import dead URLs from my last analysis
[perl5.git] / Porting / checkURL.pl
CommitLineData
23c4d79e 1#!perl
6e7dc4a9 2use strict;
23c4d79e
LB
3use warnings;
4use autodie;
5use feature qw(say);
d4b5f70a
NC
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
23c4d79e 10use List::Util qw(sum);
d4b5f70a
NC
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
23c4d79e
LB
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20 chomp $line;
21 next if $line =~ /^#/;
22 next unless $line;
23 $ignore{$line} = 1;
24}
6e7dc4a9 25
7c509e8a 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
23c4d79e
LB
27$ua->timeout(58);
28$ua->env_proxy;
6e7dc4a9 29
23c4d79e
LB
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32 unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
39 }
40);
6e7dc4a9 41
23c4d79e
LB
42my %uris;
43foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
7c509e8a
LB
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
d4b5f70a
NC
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
23c4d79e
LB
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
6e7dc4a9 54
23c4d79e
LB
55 # no need to hit rt.perl.org
56 next
3ef6a87a 57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
6e7dc4a9 58
23c4d79e
LB
59 # no need to hit rt.cpan.org
60 next
7c509e8a
LB
61 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
62
63 # no need to hit google groups (weird redirect LWP does not like)
64 next
65 if $uri =~ m{^http://groups\.google\.com/};
66
23c4d79e
LB
67 push @{ $uris{$uri} }, $filename;
68 }
69 $extract_progress += -s $filename;
70}
71
72my $bw = Parallel::Fork::BossWorkerAsync->new(
73 work_handler => \&work_alarmed,
74 global_timeout => 120,
75 worker_count => 20,
76);
77
78foreach my $uri ( keys %uris ) {
79 my @filenames = @{ $uris{$uri} };
80 $bw->add_work( { uri => $uri, filenames => \@filenames } );
81}
82
83undef $extract_progress;
84
85my $fetch_progress = Term::ProgressBar::Simple->new(
86 { count => scalar( keys %uris ),
87 name => 'Fetching URIs',
88 }
89);
90
91my %filenames;
92while ( $bw->pending() ) {
93 my $response = $bw->get_result();
94 my $uri = $response->{uri};
95 my @filenames = @{ $response->{filenames} };
96 my $is_success = $response->{is_success};
97 my $message = $response->{message};
98
99 unless ($is_success) {
100 foreach my $filename (@filenames) {
101 push @{ $filenames{$filename} },
102 { uri => $uri, message => $message };
6e7dc4a9
JH
103 }
104 }
23c4d79e 105 $fetch_progress++;
6e7dc4a9 106}
23c4d79e 107$bw->shut_down();
6e7dc4a9 108
23c4d79e
LB
109my $fh = IO::File->new('> uris.txt');
110foreach my $filename ( sort keys %filenames ) {
111 $fh->say("* $filename");
112 my @bits = @{ $filenames{$filename} };
113 foreach my $bit (@bits) {
114 my $uri = $bit->{uri};
115 my $message = $bit->{message};
116 $fh->say(" $uri");
117 $fh->say(" $message");
0d6d7233
JH
118 }
119}
23c4d79e 120$fh->close;
0d6d7233 121
23c4d79e 122say 'Finished, see uris.txt';
6e7dc4a9 123
23c4d79e
LB
124sub work_alarmed {
125 my $conf = shift;
126 eval {
127 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
128 alarm 60;
129 $conf = work($conf);
130 alarm 0;
131 };
132 if ($@) {
133 $conf->{is_success} = 0;
134 $conf->{message} = 'Timed out';
0d6d7233 135
23c4d79e
LB
136 }
137 return $conf;
0d6d7233
JH
138}
139
23c4d79e
LB
140sub work {
141 my $conf = shift;
142 my $uri = $conf->{uri};
143 my @filenames = @{ $conf->{filenames} };
0d6d7233 144
23c4d79e
LB
145 if ( $uri =~ /^http/ ) {
146 my $uri_without_fragment = URI->new($uri);
147 my $fragment = $uri_without_fragment->fragment(undef);
148 my $response = $ua->head($uri_without_fragment);
0d6d7233 149
23c4d79e
LB
150 $conf->{is_success} = $response->is_success;
151 $conf->{message} = $response->status_line;
152 return $conf;
153 } else {
0d6d7233 154
23c4d79e
LB
155 my $uri_object = URI->new($uri);
156 my $host = $uri_object->host;
157 my $path = $uri_object->path;
158 my ( $volume, $directories, $filename )
159 = File::Spec->splitpath($path);
0d6d7233 160
23c4d79e
LB
161 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
162 unless ($ftp) {
163 $conf->{is_succcess} = 0;
164 $conf->{message} = "Can not connect to $host: $@";
165 return $conf;
166 }
0d6d7233 167
23c4d79e
LB
168 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
169 unless ($can_login) {
170 $conf->{is_success} = 0;
171 $conf->{message} = "Can not login ", $ftp->message;
172 return $conf;
6e7dc4a9
JH
173 }
174
23c4d79e
LB
175 my $can_binary = $ftp->binary();
176 unless ($can_binary) {
177 $conf->{is_success} = 0;
178 $conf->{message} = "Can not binary ", $ftp->message;
179 return $conf;
180 }
181
182 my $can_cwd = $ftp->cwd($directories);
183 unless ($can_cwd) {
184 $conf->{is_success} = 0;
185 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
186 return $conf;
187 }
188
189 if ($filename) {
190 my $can_size = $ftp->size($filename);
191 unless ($can_size) {
192 $conf->{is_success} = 0;
193 $conf->{message}
194 = "Can not size $filename in $directories",
195 $ftp->message;
196 return $conf;
197 }
198 } else {
199 my ($can_dir) = $ftp->dir;
200 unless ($can_dir) {
201 my ($can_ls) = $ftp->ls;
202 unless ($can_ls) {
203 $conf->{is_success} = 0;
204 $conf->{message}
205 = "Can not dir or ls in $directories ",
206 $ftp->message;
207 return $conf;
208 }
209 }
210 }
211
212 $conf->{is_success} = 1;
213 return $conf;
6e7dc4a9
JH
214 }
215}
216
23c4d79e
LB
217__DATA__
218# these are fine but give errors
219ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
220ftp://ftp.stratus.com/pub/vos/utility/utility.html
221
7c509e8a 222# these are missing, sigh
23c4d79e
LB
223ftp://ftp.sco.com/SLS/ptf7051e.Z
224http://perlmonks.thepen.com/42898.html
7c509e8a
LB
225http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
226http://public.activestate.com/cgi-bin/perlbrowse
227http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
228http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
229http://my.smithmicro.com/mac/stuffit/
230http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
231http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
05558683
MM
232http://www.openzaurus.org/
233http://Casbah.org/
234http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
235http://www.pvhp.com/~pvhp/
236http://www.pvhp.com/%7Epvhp/
237http://www.pvhp.com/%7epvhp/
238http://www.leo.org
239http://www.madgoat.com
240http://www.mks.com/s390/gnu/
241http://www.research.att.com/sw/tools/uwin/
242http://www.tpj.com/
243http://safaribooksonline.com/
244http://use.perl.org/~autrijus/journal/25768
245http://www.s390.ibm.com/products/oe/bpxqp1.html
246http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
247http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
248http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
249http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
253http://www.w3.org/Security/Faq/
7c509e8a
LB
254
255# these are URI extraction bugs
23c4d79e
LB
256http://www.perl.org/E
257http://en.wikipedia.org/wiki/SREC_(file_format
258http://somewhere.else',-type=/
259ftp:passive-mode
260ftp:
261http:[-
262http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
263http://www.xray.mpe.mpg.de/mailing-lists/perl5-
264http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
7c509e8a
LB
265http://perl.come/
266http://www.perl.come/
23c4d79e
LB
267
268# these are used as an example
269http://example.com/
270http://something.here/
271http://users.perl5.git.perl.org/~yourlogin/
272http://github.com/USERNAME/perl/tree/orange
273http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
274http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
275http://somewhere.else$/
276http://somewhere.else$/
277http://somewhere.else/bin/foo&bar',-Type=
278http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
279http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
280http://www.perl.org/test.cgi
281http://cpan2.local/
282http://search.cpan.org/perldoc?
283http://cpan1.local/
284http://cpan.dev.local/CPAN
285http:///
286ftp://
287ftp://myurl/
288ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
289http://www14.software.ibm.com/webapp/download/downloadaz.jsp
290http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
291http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
292http://localhost/tmp/index.txt
293http://example.com/foo/bar.html
294http://example.com/Text-Bastardize-1.06.tar.gz
295ftp://example.com/sources/packages.txt
296http://example.com/sources/packages.txt
297http://example.com/sources
298ftp://example.com/sources
299http://some.where.com/dir/file.txt
300http://some.where.com/dir/a.txt
301http://foo.com/X.tgz
302ftp://foo.com/X.tgz
303http://foo/
304http://www.foo.com:8000/
305http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
306http://decoded/mirror/path
307http://a/b/c/d/e/f/g/h/i/j
308http://foo/bar.gz
309ftp://ftp.perl.org
310http://purl.org/rss/1.0/modules/taxonomy/
311ftp://ftp.sun.ac.za/CPAN/CPAN/
312ftp://ftp.cpan.org/pub/mirror/index.txt
313ftp://cpan.org/pub/mirror/index.txt
314http://example.com/~eh/
315http://plagger.org/.../rss
316http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
317http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
318http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
319http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
320http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
321http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
322http://module-build.sourceforge.net/META-spec-new.html
323http://module-build.sourceforge.net/META-spec-v1.4.html
324http://www.cs.vu.nl/~tmgil/vi.html
325http://perlcomposer.sourceforge.net/vperl.html
326http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
327http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
328http://world.std.com/~aep/ptkdb/
329http://www.castlelink.co.uk/object_system/
330http://www.fh-wedel.de/elvis/
331ftp://ftp.blarg.net/users/amol/zsh/
332ftp://ftp.funet.fi/pub/languages/perl/CPAN
333http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
7c509e8a
LB
334http://users.perl5.git.perl.org/~USERNAME
335http://foo/x//y/script.cgi/a//b
336http://xxx/script.cgi/http://foo
337http://foo/./x//z/script.cgi/a/../b//c
338http://somewhere.else/in/movie/land
339http://somewhere.else/finished.html
340http://somewhere.else/bin/foo&bar$
341http://somewhere.else/
342http://proxy:8484/
343http://proxy/
344http://myrepo.example.com/
345http://remote/source
346https://example.com/
347http://example.com:1024/
348http:///path?foo=bar
349http://[::]:1024/
350http://([/
351http://example.com:9000/index.html
352http://proxy.example.com:8080/
353http:///index.html
354http://[www.json::pp.org]/
355http://localhost/
356http://foo.example.com/
357http://abc.com/a.js
358http://whatever/man/1/crontab
359http://abc.com/c.js
360http://whatever/Foo%3A%3ABar
361http://abc.com/b.js
362http://remote.server.com/jquery.css
363http://some.other.com/page.html
364https://text.com/1/2
365https://text.com/1/2
366http://link.included.here?o=1&p=2
367http://link.included.here?o=1&amp;p=2
368http://link.included.here?o=1&amp;p=2
369http://link.included.here/
370http://foo/x//y/script.cgi/a//b
371http://xxx/script.cgi/http://foo
372http://foo/./x//z/script.cgi/a/../b//c
373http://somewhere.else/in/movie/land
374http://somewhere.else/finished.html
375http://webproxy:3128/
376http://www/
23c4d79e
LB
377
378# these are used to generate or match URLs
379http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
380http://www.cpantesters.org/show/%s.yaml
381ftp://(.*?)/(.*)/(.*
382ftp://(.*?)/(.*)/(.*
383ftp://(.*?)/(.*)/(.*
384ftp://ftp.foo.bar/
385http://$host/
386http://wwwe%3C46/
387ftp:/
7c509e8a
LB
388http://$addr/mark?commit=$
389http://search.cpan.org/~
390http:/
391ftp:%5Cn$url
392http://www.ietf.org/rfc/rfc$2.txt
393http://search.cpan.org/~
394ftp:%5Cn$url
23c4d79e
LB
395
396# weird redirects that LWP doesn't like
397http://www.theperlreview.com/community_calendar
398http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
7c509e8a 399http://sunsolve.sun.com
23c4d79e
LB
400
401# broken webserver that doesn't like HEAD requests
402http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
7c509e8a 403http://www.w3.org/TR/html4/loose.dtd
23c4d79e
LB
404
405# these have been reported upstream to CPAN authors
406http://www.gnu.org/manual/tar/html_node/tar_139.html
407http://www.w3.org/pub/WWW/TR/Wd-css-1.html
408http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
409http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
410http://search.cpan.org/search?query=Module::Build::Convert
411http://www.refcnt.org/papers/module-build-convert
412http://csrc.nist.gov/cryptval/shs.html
413http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
414http://www.debian.or.jp/~kubota/unicode-symbols.html.en
415http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
416http://www.debian.or.jp/~kubota/unicode-symbols.html.en
417http://rfc.net/rfc2781.html
418http://www.icu-project.org/charset/
419http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
420http://www.rfc-editor.org/
421http://www.rfc.net/
422http://www.oreilly.com/people/authors/lunde/cjk_inf.html
423http://www.oreilly.com/catalog/cjkvinfo/
424http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
425http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
426http://www.egt.ie/standards/iso3166/iso3166-1-en.html
427http://www.bsi-global.com/iso4217currency
428http://www.plover.com/~mjd/perl/Memoize/
429http://www.plover.com/~mjd/perl/MiniMemoize/
430http://www.sysadminmag.com/tpj/issues/vol5_5/
431ftp://ftp.tpc.int/tpc/server/UNIX/
432http://www.nara.gov/genealogy/
433http://home.utah-inter.net/kinsearch/Soundex.html
434http://www.nara.gov/genealogy/soundex/soundex.html
435http://rfc.net/rfc3461.html
436ftp://ftp.cs.pdx.edu/pub/elvis/
437http://www.fh-wedel.de/elvis/
7d0fb9b8 438http://lists.perl.org/list/perl-mvs.html
7c509e8a
LB
439http://www.cpan.org/ports/os2/
440http://github.com/dagolden/cpan-meta-spec
441http://github.com/dagolden/cpan-meta-spec/issues
442http://www.opensource.org/licenses/lgpl-license.phpt
443http://reality.sgi.com/ariel
444http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
445http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
446http://www.nsrl.nist.gov/testdata/
447http://public.activestate.com/cgi-bin/perlbrowse/p/31194
448http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
449http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
450http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
451http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
452http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
453http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
454http://github.com/schwern/extutils-makemaker
455https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
456http://www.json.org/JSON::PP_checker/
457ftp://ftp.kiae.su/pub/unix/fido/
458http://www.gallistel.net/nparker/weather/code/
459http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
460ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
461http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
462http://public.activestate.com/cgi-bin/perlbrowse/p/33567
463http://public.activestate.com/cgi-bin/perlbrowse/p/33566
464http://www.dsmit.com/cons/
465http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
23c4d79e 466
6e7dc4a9 467__END__
23c4d79e
LB
468
469=head1 NAME
470
471checkURL.pl - Check that all the URLs in the Perl source are valid
472
473=head1 DESCRIPTION
474
475This program checks that all the URLs in the Perl source are valid. It
476checks HTTP and FTP links in parallel and contains a list of known
477bad example links in its source. It takes 4 minutes to run on my
478machine. The results are written to 'uris.txt' and list the filename,
479the URL and the error:
480
481 * ext/Locale-Maketext/lib/Locale/Maketext.pod
482 http://sunsite.dk/RFC/rfc/rfc2277.html
483 404 Not Found
484 ...
485
486It should be run every so often and links fixed and upstream authors
487notified.
488
489Note that the web is unstable and some websites are temporarily down.