This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Porting/checkURL.pl now requires rather than uses all non-core modules.
[perl5.git] / Porting / checkURL.pl
CommitLineData
23c4d79e 1#!perl
6e7dc4a9 2use strict;
23c4d79e
LB
3use warnings;
4use autodie;
5use feature qw(say);
d4b5f70a
NC
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
23c4d79e 10use List::Util qw(sum);
d4b5f70a
NC
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
23c4d79e
LB
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20 chomp $line;
21 next if $line =~ /^#/;
22 next unless $line;
23 $ignore{$line} = 1;
24}
6e7dc4a9 25
7c509e8a 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
23c4d79e
LB
27$ua->timeout(58);
28$ua->env_proxy;
6e7dc4a9 29
23c4d79e
LB
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32 unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
39 }
40);
6e7dc4a9 41
23c4d79e
LB
42my %uris;
43foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
7c509e8a
LB
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
d4b5f70a
NC
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
23c4d79e
LB
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
6e7dc4a9 54
23c4d79e
LB
55 # no need to hit rt.perl.org
56 next
7c509e8a 57 if $uri =~ m{^https?://rt.perl.org/rt3/Ticket/Display.html?id=\d+$};
6e7dc4a9 58
23c4d79e
LB
59 # no need to hit rt.cpan.org
60 next
7c509e8a
LB
61 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
62
63 # no need to hit google groups (weird redirect LWP does not like)
64 next
65 if $uri =~ m{^http://groups\.google\.com/};
66
23c4d79e
LB
67 push @{ $uris{$uri} }, $filename;
68 }
69 $extract_progress += -s $filename;
70}
71
72my $bw = Parallel::Fork::BossWorkerAsync->new(
73 work_handler => \&work_alarmed,
74 global_timeout => 120,
75 worker_count => 20,
76);
77
78foreach my $uri ( keys %uris ) {
79 my @filenames = @{ $uris{$uri} };
80 $bw->add_work( { uri => $uri, filenames => \@filenames } );
81}
82
83undef $extract_progress;
84
85my $fetch_progress = Term::ProgressBar::Simple->new(
86 { count => scalar( keys %uris ),
87 name => 'Fetching URIs',
88 }
89);
90
91my %filenames;
92while ( $bw->pending() ) {
93 my $response = $bw->get_result();
94 my $uri = $response->{uri};
95 my @filenames = @{ $response->{filenames} };
96 my $is_success = $response->{is_success};
97 my $message = $response->{message};
98
99 unless ($is_success) {
100 foreach my $filename (@filenames) {
101 push @{ $filenames{$filename} },
102 { uri => $uri, message => $message };
6e7dc4a9
JH
103 }
104 }
23c4d79e 105 $fetch_progress++;
6e7dc4a9 106}
23c4d79e 107$bw->shut_down();
6e7dc4a9 108
23c4d79e
LB
109my $fh = IO::File->new('> uris.txt');
110foreach my $filename ( sort keys %filenames ) {
111 $fh->say("* $filename");
112 my @bits = @{ $filenames{$filename} };
113 foreach my $bit (@bits) {
114 my $uri = $bit->{uri};
115 my $message = $bit->{message};
116 $fh->say(" $uri");
117 $fh->say(" $message");
0d6d7233
JH
118 }
119}
23c4d79e 120$fh->close;
0d6d7233 121
23c4d79e 122say 'Finished, see uris.txt';
6e7dc4a9 123
23c4d79e
LB
124sub work_alarmed {
125 my $conf = shift;
126 eval {
127 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
128 alarm 60;
129 $conf = work($conf);
130 alarm 0;
131 };
132 if ($@) {
133 $conf->{is_success} = 0;
134 $conf->{message} = 'Timed out';
0d6d7233 135
23c4d79e
LB
136 }
137 return $conf;
0d6d7233
JH
138}
139
23c4d79e
LB
140sub work {
141 my $conf = shift;
142 my $uri = $conf->{uri};
143 my @filenames = @{ $conf->{filenames} };
0d6d7233 144
23c4d79e
LB
145 if ( $uri =~ /^http/ ) {
146 my $uri_without_fragment = URI->new($uri);
147 my $fragment = $uri_without_fragment->fragment(undef);
148 my $response = $ua->head($uri_without_fragment);
0d6d7233 149
23c4d79e
LB
150 $conf->{is_success} = $response->is_success;
151 $conf->{message} = $response->status_line;
152 return $conf;
153 } else {
0d6d7233 154
23c4d79e
LB
155 my $uri_object = URI->new($uri);
156 my $host = $uri_object->host;
157 my $path = $uri_object->path;
158 my ( $volume, $directories, $filename )
159 = File::Spec->splitpath($path);
0d6d7233 160
23c4d79e
LB
161 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
162 unless ($ftp) {
163 $conf->{is_succcess} = 0;
164 $conf->{message} = "Can not connect to $host: $@";
165 return $conf;
166 }
0d6d7233 167
23c4d79e
LB
168 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
169 unless ($can_login) {
170 $conf->{is_success} = 0;
171 $conf->{message} = "Can not login ", $ftp->message;
172 return $conf;
6e7dc4a9
JH
173 }
174
23c4d79e
LB
175 my $can_binary = $ftp->binary();
176 unless ($can_binary) {
177 $conf->{is_success} = 0;
178 $conf->{message} = "Can not binary ", $ftp->message;
179 return $conf;
180 }
181
182 my $can_cwd = $ftp->cwd($directories);
183 unless ($can_cwd) {
184 $conf->{is_success} = 0;
185 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
186 return $conf;
187 }
188
189 if ($filename) {
190 my $can_size = $ftp->size($filename);
191 unless ($can_size) {
192 $conf->{is_success} = 0;
193 $conf->{message}
194 = "Can not size $filename in $directories",
195 $ftp->message;
196 return $conf;
197 }
198 } else {
199 my ($can_dir) = $ftp->dir;
200 unless ($can_dir) {
201 my ($can_ls) = $ftp->ls;
202 unless ($can_ls) {
203 $conf->{is_success} = 0;
204 $conf->{message}
205 = "Can not dir or ls in $directories ",
206 $ftp->message;
207 return $conf;
208 }
209 }
210 }
211
212 $conf->{is_success} = 1;
213 return $conf;
6e7dc4a9
JH
214 }
215}
216
23c4d79e
LB
217__DATA__
218# these are fine but give errors
219ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
220ftp://ftp.stratus.com/pub/vos/utility/utility.html
221
7c509e8a 222# these are missing, sigh
23c4d79e
LB
223ftp://ftp.sco.com/SLS/ptf7051e.Z
224http://perlmonks.thepen.com/42898.html
7c509e8a
LB
225http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
226http://public.activestate.com/cgi-bin/perlbrowse
227http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
228http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
229http://my.smithmicro.com/mac/stuffit/
230http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
231http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
232
233# these are URI extraction bugs
23c4d79e
LB
234http://www.perl.org/E
235http://en.wikipedia.org/wiki/SREC_(file_format
236http://somewhere.else',-type=/
237ftp:passive-mode
238ftp:
239http:[-
240http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
241http://www.xray.mpe.mpg.de/mailing-lists/perl5-
242http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
7c509e8a
LB
243http://perl.come/
244http://www.perl.come/
23c4d79e
LB
245
246# these are used as an example
247http://example.com/
248http://something.here/
249http://users.perl5.git.perl.org/~yourlogin/
250http://github.com/USERNAME/perl/tree/orange
251http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
252http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
253http://somewhere.else$/
254http://somewhere.else$/
255http://somewhere.else/bin/foo&bar',-Type=
256http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
257http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
258http://www.perl.org/test.cgi
259http://cpan2.local/
260http://search.cpan.org/perldoc?
261http://cpan1.local/
262http://cpan.dev.local/CPAN
263http:///
264ftp://
265ftp://myurl/
266ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
267http://www14.software.ibm.com/webapp/download/downloadaz.jsp
268http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
269http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
270http://localhost/tmp/index.txt
271http://example.com/foo/bar.html
272http://example.com/Text-Bastardize-1.06.tar.gz
273ftp://example.com/sources/packages.txt
274http://example.com/sources/packages.txt
275http://example.com/sources
276ftp://example.com/sources
277http://some.where.com/dir/file.txt
278http://some.where.com/dir/a.txt
279http://foo.com/X.tgz
280ftp://foo.com/X.tgz
281http://foo/
282http://www.foo.com:8000/
283http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
284http://decoded/mirror/path
285http://a/b/c/d/e/f/g/h/i/j
286http://foo/bar.gz
287ftp://ftp.perl.org
288http://purl.org/rss/1.0/modules/taxonomy/
289ftp://ftp.sun.ac.za/CPAN/CPAN/
290ftp://ftp.cpan.org/pub/mirror/index.txt
291ftp://cpan.org/pub/mirror/index.txt
292http://example.com/~eh/
293http://plagger.org/.../rss
294http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
295http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
296http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
297http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
298http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
299http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
300http://module-build.sourceforge.net/META-spec-new.html
301http://module-build.sourceforge.net/META-spec-v1.4.html
302http://www.cs.vu.nl/~tmgil/vi.html
303http://perlcomposer.sourceforge.net/vperl.html
304http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
305http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
306http://world.std.com/~aep/ptkdb/
307http://www.castlelink.co.uk/object_system/
308http://www.fh-wedel.de/elvis/
309ftp://ftp.blarg.net/users/amol/zsh/
310ftp://ftp.funet.fi/pub/languages/perl/CPAN
311http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
7c509e8a
LB
312http://users.perl5.git.perl.org/~USERNAME
313http://foo/x//y/script.cgi/a//b
314http://xxx/script.cgi/http://foo
315http://foo/./x//z/script.cgi/a/../b//c
316http://somewhere.else/in/movie/land
317http://somewhere.else/finished.html
318http://somewhere.else/bin/foo&bar$
319http://somewhere.else/
320http://proxy:8484/
321http://proxy/
322http://myrepo.example.com/
323http://remote/source
324https://example.com/
325http://example.com:1024/
326http:///path?foo=bar
327http://[::]:1024/
328http://([/
329http://example.com:9000/index.html
330http://proxy.example.com:8080/
331http:///index.html
332http://[www.json::pp.org]/
333http://localhost/
334http://foo.example.com/
335http://abc.com/a.js
336http://whatever/man/1/crontab
337http://abc.com/c.js
338http://whatever/Foo%3A%3ABar
339http://abc.com/b.js
340http://remote.server.com/jquery.css
341http://some.other.com/page.html
342https://text.com/1/2
343https://text.com/1/2
344http://link.included.here?o=1&p=2
345http://link.included.here?o=1&amp;p=2
346http://link.included.here?o=1&amp;p=2
347http://link.included.here/
348http://foo/x//y/script.cgi/a//b
349http://xxx/script.cgi/http://foo
350http://foo/./x//z/script.cgi/a/../b//c
351http://somewhere.else/in/movie/land
352http://somewhere.else/finished.html
353http://webproxy:3128/
354http://www/
23c4d79e
LB
355
356# these are used to generate or match URLs
357http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
358http://www.cpantesters.org/show/%s.yaml
359ftp://(.*?)/(.*)/(.*
360ftp://(.*?)/(.*)/(.*
361ftp://(.*?)/(.*)/(.*
362ftp://ftp.foo.bar/
363http://$host/
364http://wwwe%3C46/
365ftp:/
7c509e8a
LB
366http://$addr/mark?commit=$
367http://search.cpan.org/~
368http:/
369ftp:%5Cn$url
370http://www.ietf.org/rfc/rfc$2.txt
371http://search.cpan.org/~
372ftp:%5Cn$url
23c4d79e
LB
373
374# weird redirects that LWP doesn't like
375http://www.theperlreview.com/community_calendar
376http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
7c509e8a 377http://sunsolve.sun.com
23c4d79e
LB
378
379# broken webserver that doesn't like HEAD requests
380http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
7c509e8a 381http://www.w3.org/TR/html4/loose.dtd
23c4d79e
LB
382
383# these have been reported upstream to CPAN authors
384http://www.gnu.org/manual/tar/html_node/tar_139.html
385http://www.w3.org/pub/WWW/TR/Wd-css-1.html
386http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
387http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
388http://search.cpan.org/search?query=Module::Build::Convert
389http://www.refcnt.org/papers/module-build-convert
390http://csrc.nist.gov/cryptval/shs.html
391http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
392http://www.debian.or.jp/~kubota/unicode-symbols.html.en
393http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
394http://www.debian.or.jp/~kubota/unicode-symbols.html.en
395http://rfc.net/rfc2781.html
396http://www.icu-project.org/charset/
397http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
398http://www.rfc-editor.org/
399http://www.rfc.net/
400http://www.oreilly.com/people/authors/lunde/cjk_inf.html
401http://www.oreilly.com/catalog/cjkvinfo/
402http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
403http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
404http://www.egt.ie/standards/iso3166/iso3166-1-en.html
405http://www.bsi-global.com/iso4217currency
406http://www.plover.com/~mjd/perl/Memoize/
407http://www.plover.com/~mjd/perl/MiniMemoize/
408http://www.sysadminmag.com/tpj/issues/vol5_5/
409ftp://ftp.tpc.int/tpc/server/UNIX/
410http://www.nara.gov/genealogy/
411http://home.utah-inter.net/kinsearch/Soundex.html
412http://www.nara.gov/genealogy/soundex/soundex.html
413http://rfc.net/rfc3461.html
414ftp://ftp.cs.pdx.edu/pub/elvis/
415http://www.fh-wedel.de/elvis/
7d0fb9b8 416http://lists.perl.org/list/perl-mvs.html
7c509e8a
LB
417http://www.cpan.org/ports/os2/
418http://github.com/dagolden/cpan-meta-spec
419http://github.com/dagolden/cpan-meta-spec/issues
420http://www.opensource.org/licenses/lgpl-license.phpt
421http://reality.sgi.com/ariel
422http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
423http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
424http://www.nsrl.nist.gov/testdata/
425http://public.activestate.com/cgi-bin/perlbrowse/p/31194
426http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
427http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
428http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
429http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
430http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
431http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
432http://github.com/schwern/extutils-makemaker
433https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
434http://www.json.org/JSON::PP_checker/
435ftp://ftp.kiae.su/pub/unix/fido/
436http://www.gallistel.net/nparker/weather/code/
437http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
438ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
439http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
440http://public.activestate.com/cgi-bin/perlbrowse/p/33567
441http://public.activestate.com/cgi-bin/perlbrowse/p/33566
442http://www.dsmit.com/cons/
443http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
23c4d79e 444
6e7dc4a9 445__END__
23c4d79e
LB
446
447=head1 NAME
448
449checkURL.pl - Check that all the URLs in the Perl source are valid
450
451=head1 DESCRIPTION
452
453This program checks that all the URLs in the Perl source are valid. It
454checks HTTP and FTP links in parallel and contains a list of known
455bad example links in its source. It takes 4 minutes to run on my
456machine. The results are written to 'uris.txt' and list the filename,
457the URL and the error:
458
459 * ext/Locale-Maketext/lib/Locale/Maketext.pod
460 http://sunsite.dk/RFC/rfc/rfc2277.html
461 404 Not Found
462 ...
463
464It should be run every so often and links fixed and upstream authors
465notified.
466
467Note that the web is unstable and some websites are temporarily down.