This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Import dead URLs from my last analysis
[perl5.git] / Porting / checkURL.pl
1 #!perl
2 use strict;
3 use warnings;
4 use autodie;
5 use feature qw(say);
6 require File::Find::Rule;
7 require File::Slurp;
8 require File::Spec;
9 require IO::Socket::SSL;
10 use List::Util qw(sum);
11 require LWP::UserAgent;
12 require Net::FTP;
13 require Parallel::Fork::BossWorkerAsync;
14 require Term::ProgressBar::Simple;
15 require URI::Find::Simple;
16 $| = 1;
17
18 my %ignore;
19 while ( my $line = <main::DATA> ) {
20     chomp $line;
21     next if $line =~ /^#/;
22     next unless $line;
23     $ignore{$line} = 1;
24 }
25
26 my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27 $ua->timeout(58);
28 $ua->env_proxy;
29
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32     unless @filenames;
33
34 my $total_bytes = sum map {-s} @filenames;
35
36 my $extract_progress = Term::ProgressBar::Simple->new(
37     {   count => $total_bytes,
38         name  => 'Extracting URIs',
39     }
40 );
41
42 my %uris;
43 foreach my $filename (@filenames) {
44     next if $filename =~ /uris\.txt/;
45     next if $filename =~ /check_uris/;
46     next if $filename =~ /\.patch$/;
47     next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48     next if $filename =~ /checkURL\.pl$/;
49     my $contents = File::Slurp::read_file($filename);
50     my @uris     = URI::Find::Simple::list_uris($contents);
51     foreach my $uri (@uris) {
52         next unless $uri =~ /^(http|ftp)/;
53         next if $ignore{$uri};
54
55         # no need to hit rt.perl.org
56         next
57             if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59         # no need to hit rt.cpan.org
60         next
61             if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
62
63         # no need to hit google groups (weird redirect LWP does not like)
64         next
65             if $uri =~ m{^http://groups\.google\.com/};
66
67         push @{ $uris{$uri} }, $filename;
68     }
69     $extract_progress += -s $filename;
70 }
71
72 my $bw = Parallel::Fork::BossWorkerAsync->new(
73     work_handler   => \&work_alarmed,
74     global_timeout => 120,
75     worker_count   => 20,
76 );
77
78 foreach my $uri ( keys %uris ) {
79     my @filenames = @{ $uris{$uri} };
80     $bw->add_work( { uri => $uri, filenames => \@filenames } );
81 }
82
83 undef $extract_progress;
84
85 my $fetch_progress = Term::ProgressBar::Simple->new(
86     {   count => scalar( keys %uris ),
87         name  => 'Fetching URIs',
88     }
89 );
90
91 my %filenames;
92 while ( $bw->pending() ) {
93     my $response   = $bw->get_result();
94     my $uri        = $response->{uri};
95     my @filenames  = @{ $response->{filenames} };
96     my $is_success = $response->{is_success};
97     my $message    = $response->{message};
98
99     unless ($is_success) {
100         foreach my $filename (@filenames) {
101             push @{ $filenames{$filename} },
102                 { uri => $uri, message => $message };
103         }
104     }
105     $fetch_progress++;
106 }
107 $bw->shut_down();
108
109 my $fh = IO::File->new('> uris.txt');
110 foreach my $filename ( sort keys %filenames ) {
111     $fh->say("* $filename");
112     my @bits = @{ $filenames{$filename} };
113     foreach my $bit (@bits) {
114         my $uri     = $bit->{uri};
115         my $message = $bit->{message};
116         $fh->say("  $uri");
117         $fh->say("    $message");
118     }
119 }
120 $fh->close;
121
122 say 'Finished, see uris.txt';
123
124 sub work_alarmed {
125     my $conf = shift;
126     eval {
127         local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
128         alarm 60;
129         $conf = work($conf);
130         alarm 0;
131     };
132     if ($@) {
133         $conf->{is_success} = 0;
134         $conf->{message}    = 'Timed out';
135
136     }
137     return $conf;
138 }
139
140 sub work {
141     my $conf      = shift;
142     my $uri       = $conf->{uri};
143     my @filenames = @{ $conf->{filenames} };
144
145     if ( $uri =~ /^http/ ) {
146         my $uri_without_fragment = URI->new($uri);
147         my $fragment             = $uri_without_fragment->fragment(undef);
148         my $response             = $ua->head($uri_without_fragment);
149
150         $conf->{is_success} = $response->is_success;
151         $conf->{message}    = $response->status_line;
152         return $conf;
153     } else {
154
155         my $uri_object = URI->new($uri);
156         my $host       = $uri_object->host;
157         my $path       = $uri_object->path;
158         my ( $volume, $directories, $filename )
159             = File::Spec->splitpath($path);
160
161         my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
162         unless ($ftp) {
163             $conf->{is_succcess} = 0;
164             $conf->{message}     = "Can not connect to $host: $@";
165             return $conf;
166         }
167
168         my $can_login = $ftp->login( "anonymous", '-anonymous@' );
169         unless ($can_login) {
170             $conf->{is_success} = 0;
171             $conf->{message} = "Can not login ", $ftp->message;
172             return $conf;
173         }
174
175         my $can_binary = $ftp->binary();
176         unless ($can_binary) {
177             $conf->{is_success} = 0;
178             $conf->{message} = "Can not binary ", $ftp->message;
179             return $conf;
180         }
181
182         my $can_cwd = $ftp->cwd($directories);
183         unless ($can_cwd) {
184             $conf->{is_success} = 0;
185             $conf->{message} = "Can not cwd to $directories ", $ftp->message;
186             return $conf;
187         }
188
189         if ($filename) {
190             my $can_size = $ftp->size($filename);
191             unless ($can_size) {
192                 $conf->{is_success} = 0;
193                 $conf->{message}
194                     = "Can not size $filename in $directories",
195                     $ftp->message;
196                 return $conf;
197             }
198         } else {
199             my ($can_dir) = $ftp->dir;
200             unless ($can_dir) {
201                 my ($can_ls) = $ftp->ls;
202                 unless ($can_ls) {
203                     $conf->{is_success} = 0;
204                     $conf->{message}
205                         = "Can not dir or ls in $directories ",
206                         $ftp->message;
207                     return $conf;
208                 }
209             }
210         }
211
212         $conf->{is_success} = 1;
213         return $conf;
214     }
215 }
216
217 __DATA__
218 # these are fine but give errors
219 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
220 ftp://ftp.stratus.com/pub/vos/utility/utility.html
221
222 # these are missing, sigh
223 ftp://ftp.sco.com/SLS/ptf7051e.Z
224 http://perlmonks.thepen.com/42898.html
225 http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
226 http://public.activestate.com/cgi-bin/perlbrowse
227 http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
228 http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
229 http://my.smithmicro.com/mac/stuffit/
230 http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
231 http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
232 http://www.openzaurus.org/
233 http://Casbah.org/
234 http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
235 http://www.pvhp.com/~pvhp/
236 http://www.pvhp.com/%7Epvhp/
237 http://www.pvhp.com/%7epvhp/
238 http://www.leo.org
239 http://www.madgoat.com
240 http://www.mks.com/s390/gnu/
241 http://www.research.att.com/sw/tools/uwin/
242 http://www.tpj.com/
243 http://safaribooksonline.com/
244 http://use.perl.org/~autrijus/journal/25768
245 http://www.s390.ibm.com/products/oe/bpxqp1.html
246 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
247 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
248 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
249 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
250 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
251 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
252 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
253 http://www.w3.org/Security/Faq/
254
255 # these are URI extraction bugs
256 http://www.perl.org/E
257 http://en.wikipedia.org/wiki/SREC_(file_format
258 http://somewhere.else',-type=/
259 ftp:passive-mode
260 ftp:
261 http:[-
262 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
263 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
264 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
265 http://perl.come/
266 http://www.perl.come/
267
268 # these are used as an example
269 http://example.com/
270 http://something.here/
271 http://users.perl5.git.perl.org/~yourlogin/
272 http://github.com/USERNAME/perl/tree/orange
273 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
274 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
275 http://somewhere.else$/
276 http://somewhere.else$/
277 http://somewhere.else/bin/foo&bar',-Type=
278 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
279 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
280 http://www.perl.org/test.cgi
281 http://cpan2.local/
282 http://search.cpan.org/perldoc?
283 http://cpan1.local/
284 http://cpan.dev.local/CPAN
285 http:///
286 ftp://
287 ftp://myurl/
288 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
289 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
290 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
291 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
292 http://localhost/tmp/index.txt
293 http://example.com/foo/bar.html
294 http://example.com/Text-Bastardize-1.06.tar.gz
295 ftp://example.com/sources/packages.txt
296 http://example.com/sources/packages.txt
297 http://example.com/sources
298 ftp://example.com/sources
299 http://some.where.com/dir/file.txt
300 http://some.where.com/dir/a.txt
301 http://foo.com/X.tgz
302 ftp://foo.com/X.tgz
303 http://foo/
304 http://www.foo.com:8000/
305 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
306 http://decoded/mirror/path
307 http://a/b/c/d/e/f/g/h/i/j
308 http://foo/bar.gz
309 ftp://ftp.perl.org
310 http://purl.org/rss/1.0/modules/taxonomy/
311 ftp://ftp.sun.ac.za/CPAN/CPAN/
312 ftp://ftp.cpan.org/pub/mirror/index.txt
313 ftp://cpan.org/pub/mirror/index.txt
314 http://example.com/~eh/
315 http://plagger.org/.../rss
316 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
317 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
318 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
319 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
320 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
321 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip  
322 http://module-build.sourceforge.net/META-spec-new.html
323 http://module-build.sourceforge.net/META-spec-v1.4.html
324 http://www.cs.vu.nl/~tmgil/vi.html
325 http://perlcomposer.sourceforge.net/vperl.html
326 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
327 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
328 http://world.std.com/~aep/ptkdb/
329 http://www.castlelink.co.uk/object_system/
330 http://www.fh-wedel.de/elvis/
331 ftp://ftp.blarg.net/users/amol/zsh/
332 ftp://ftp.funet.fi/pub/languages/perl/CPAN
333 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
334 http://users.perl5.git.perl.org/~USERNAME
335 http://foo/x//y/script.cgi/a//b
336 http://xxx/script.cgi/http://foo
337 http://foo/./x//z/script.cgi/a/../b//c
338 http://somewhere.else/in/movie/land
339 http://somewhere.else/finished.html
340 http://somewhere.else/bin/foo&bar$
341 http://somewhere.else/
342 http://proxy:8484/
343 http://proxy/
344 http://myrepo.example.com/
345 http://remote/source
346 https://example.com/
347 http://example.com:1024/
348 http:///path?foo=bar
349 http://[::]:1024/
350 http://([/
351 http://example.com:9000/index.html
352 http://proxy.example.com:8080/
353 http:///index.html
354 http://[www.json::pp.org]/
355 http://localhost/
356 http://foo.example.com/
357 http://abc.com/a.js
358 http://whatever/man/1/crontab
359 http://abc.com/c.js
360 http://whatever/Foo%3A%3ABar
361 http://abc.com/b.js
362 http://remote.server.com/jquery.css
363 http://some.other.com/page.html
364 https://text.com/1/2
365 https://text.com/1/2
366 http://link.included.here?o=1&p=2
367 http://link.included.here?o=1&amp;p=2
368 http://link.included.here?o=1&amp;p=2
369 http://link.included.here/
370 http://foo/x//y/script.cgi/a//b
371 http://xxx/script.cgi/http://foo
372 http://foo/./x//z/script.cgi/a/../b//c
373 http://somewhere.else/in/movie/land
374 http://somewhere.else/finished.html
375 http://webproxy:3128/
376 http://www/
377
378 # these are used to generate or match URLs
379 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
380 http://www.cpantesters.org/show/%s.yaml
381 ftp://(.*?)/(.*)/(.*
382 ftp://(.*?)/(.*)/(.*
383 ftp://(.*?)/(.*)/(.*
384 ftp://ftp.foo.bar/
385 http://$host/
386 http://wwwe%3C46/
387 ftp:/
388 http://$addr/mark?commit=$
389 http://search.cpan.org/~
390 http:/
391 ftp:%5Cn$url
392 http://www.ietf.org/rfc/rfc$2.txt
393 http://search.cpan.org/~
394 ftp:%5Cn$url
395
396 # weird redirects that LWP doesn't like
397 http://www.theperlreview.com/community_calendar
398 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
399 http://sunsolve.sun.com
400
401 # broken webserver that doesn't like HEAD requests
402 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
403 http://www.w3.org/TR/html4/loose.dtd
404
405 # these have been reported upstream to CPAN authors
406 http://www.gnu.org/manual/tar/html_node/tar_139.html
407 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
408 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
409 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
410 http://search.cpan.org/search?query=Module::Build::Convert
411 http://www.refcnt.org/papers/module-build-convert
412 http://csrc.nist.gov/cryptval/shs.html
413 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
414 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
415 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
416 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
417 http://rfc.net/rfc2781.html
418 http://www.icu-project.org/charset/
419 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
420 http://www.rfc-editor.org/
421 http://www.rfc.net/
422 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
423 http://www.oreilly.com/catalog/cjkvinfo/
424 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
425 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
426 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
427 http://www.bsi-global.com/iso4217currency
428 http://www.plover.com/~mjd/perl/Memoize/
429 http://www.plover.com/~mjd/perl/MiniMemoize/
430 http://www.sysadminmag.com/tpj/issues/vol5_5/
431 ftp://ftp.tpc.int/tpc/server/UNIX/
432 http://www.nara.gov/genealogy/
433 http://home.utah-inter.net/kinsearch/Soundex.html
434 http://www.nara.gov/genealogy/soundex/soundex.html
435 http://rfc.net/rfc3461.html
436 ftp://ftp.cs.pdx.edu/pub/elvis/
437 http://www.fh-wedel.de/elvis/
438 http://lists.perl.org/list/perl-mvs.html
439 http://www.cpan.org/ports/os2/
440 http://github.com/dagolden/cpan-meta-spec
441 http://github.com/dagolden/cpan-meta-spec/issues
442 http://www.opensource.org/licenses/lgpl-license.phpt
443 http://reality.sgi.com/ariel
444 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
445 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
446 http://www.nsrl.nist.gov/testdata/
447 http://public.activestate.com/cgi-bin/perlbrowse/p/31194
448 http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
449 http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
450 http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
451 http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
452 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
453 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
454 http://github.com/schwern/extutils-makemaker
455 https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
456 http://www.json.org/JSON::PP_checker/
457 ftp://ftp.kiae.su/pub/unix/fido/
458 http://www.gallistel.net/nparker/weather/code/
459 http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
460 ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
461 http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
462 http://public.activestate.com/cgi-bin/perlbrowse/p/33567
463 http://public.activestate.com/cgi-bin/perlbrowse/p/33566
464 http://www.dsmit.com/cons/
465 http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
466
467 __END__
468
469 =head1 NAME
470
471 checkURL.pl - Check that all the URLs in the Perl source are valid
472
473 =head1 DESCRIPTION
474
475 This program checks that all the URLs in the Perl source are valid. It
476 checks HTTP and FTP links in parallel and contains a list of known
477 bad example links in its source. It takes 4 minutes to run on my
478 machine. The results are written to 'uris.txt' and list the filename,
479 the URL and the error:
480
481   * ext/Locale-Maketext/lib/Locale/Maketext.pod
482     http://sunsite.dk/RFC/rfc/rfc2277.html
483       404 Not Found
484   ...
485
486 It should be run every so often and links fixed and upstream authors
487 notified.
488
489 Note that the web is unstable and some websites are temporarily down.