This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove AT&T UWIN support
[perl5.git] / Porting / checkURL.pl
CommitLineData
23c4d79e 1#!perl
6e7dc4a9 2use strict;
23c4d79e
LB
3use warnings;
4use autodie;
5use feature qw(say);
d4b5f70a
NC
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
23c4d79e 10use List::Util qw(sum);
d4b5f70a
NC
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
23c4d79e
LB
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20 chomp $line;
21 next if $line =~ /^#/;
22 next unless $line;
23 $ignore{$line} = 1;
24}
6e7dc4a9 25
7c509e8a 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
23c4d79e
LB
27$ua->timeout(58);
28$ua->env_proxy;
6e7dc4a9 29
23c4d79e
LB
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32 unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
39 }
40);
6e7dc4a9 41
23c4d79e
LB
42my %uris;
43foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
7c509e8a
LB
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
d4b5f70a
NC
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
23c4d79e
LB
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
6e7dc4a9 54
23c4d79e
LB
55 # no need to hit rt.perl.org
56 next
3ef6a87a 57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
6e7dc4a9 58
b4285b0d
DB
59 # no need to hit github
60 next
61 if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
62
23c4d79e
LB
63 # no need to hit rt.cpan.org
64 next
7c509e8a
LB
65 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
66
67 # no need to hit google groups (weird redirect LWP does not like)
68 next
69 if $uri =~ m{^http://groups\.google\.com/};
70
23c4d79e
LB
71 push @{ $uris{$uri} }, $filename;
72 }
73 $extract_progress += -s $filename;
74}
75
76my $bw = Parallel::Fork::BossWorkerAsync->new(
77 work_handler => \&work_alarmed,
78 global_timeout => 120,
79 worker_count => 20,
80);
81
82foreach my $uri ( keys %uris ) {
83 my @filenames = @{ $uris{$uri} };
84 $bw->add_work( { uri => $uri, filenames => \@filenames } );
85}
86
87undef $extract_progress;
88
89my $fetch_progress = Term::ProgressBar::Simple->new(
90 { count => scalar( keys %uris ),
91 name => 'Fetching URIs',
92 }
93);
94
95my %filenames;
96while ( $bw->pending() ) {
97 my $response = $bw->get_result();
98 my $uri = $response->{uri};
99 my @filenames = @{ $response->{filenames} };
100 my $is_success = $response->{is_success};
101 my $message = $response->{message};
102
103 unless ($is_success) {
104 foreach my $filename (@filenames) {
105 push @{ $filenames{$filename} },
106 { uri => $uri, message => $message };
6e7dc4a9
JH
107 }
108 }
23c4d79e 109 $fetch_progress++;
6e7dc4a9 110}
23c4d79e 111$bw->shut_down();
6e7dc4a9 112
23c4d79e
LB
113my $fh = IO::File->new('> uris.txt');
114foreach my $filename ( sort keys %filenames ) {
115 $fh->say("* $filename");
116 my @bits = @{ $filenames{$filename} };
117 foreach my $bit (@bits) {
118 my $uri = $bit->{uri};
119 my $message = $bit->{message};
120 $fh->say(" $uri");
121 $fh->say(" $message");
0d6d7233
JH
122 }
123}
23c4d79e 124$fh->close;
0d6d7233 125
23c4d79e 126say 'Finished, see uris.txt';
6e7dc4a9 127
23c4d79e
LB
128sub work_alarmed {
129 my $conf = shift;
130 eval {
131 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
132 alarm 60;
133 $conf = work($conf);
134 alarm 0;
135 };
136 if ($@) {
137 $conf->{is_success} = 0;
138 $conf->{message} = 'Timed out';
0d6d7233 139
23c4d79e
LB
140 }
141 return $conf;
0d6d7233
JH
142}
143
23c4d79e
LB
144sub work {
145 my $conf = shift;
146 my $uri = $conf->{uri};
147 my @filenames = @{ $conf->{filenames} };
0d6d7233 148
23c4d79e
LB
149 if ( $uri =~ /^http/ ) {
150 my $uri_without_fragment = URI->new($uri);
151 my $fragment = $uri_without_fragment->fragment(undef);
152 my $response = $ua->head($uri_without_fragment);
0d6d7233 153
23c4d79e
LB
154 $conf->{is_success} = $response->is_success;
155 $conf->{message} = $response->status_line;
156 return $conf;
157 } else {
0d6d7233 158
23c4d79e
LB
159 my $uri_object = URI->new($uri);
160 my $host = $uri_object->host;
161 my $path = $uri_object->path;
162 my ( $volume, $directories, $filename )
163 = File::Spec->splitpath($path);
0d6d7233 164
23c4d79e
LB
165 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
166 unless ($ftp) {
167 $conf->{is_succcess} = 0;
168 $conf->{message} = "Can not connect to $host: $@";
169 return $conf;
170 }
0d6d7233 171
23c4d79e
LB
172 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173 unless ($can_login) {
174 $conf->{is_success} = 0;
175 $conf->{message} = "Can not login ", $ftp->message;
176 return $conf;
6e7dc4a9
JH
177 }
178
23c4d79e
LB
179 my $can_binary = $ftp->binary();
180 unless ($can_binary) {
181 $conf->{is_success} = 0;
182 $conf->{message} = "Can not binary ", $ftp->message;
183 return $conf;
184 }
185
186 my $can_cwd = $ftp->cwd($directories);
187 unless ($can_cwd) {
188 $conf->{is_success} = 0;
189 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
190 return $conf;
191 }
192
193 if ($filename) {
194 my $can_size = $ftp->size($filename);
195 unless ($can_size) {
196 $conf->{is_success} = 0;
197 $conf->{message}
198 = "Can not size $filename in $directories",
199 $ftp->message;
200 return $conf;
201 }
202 } else {
203 my ($can_dir) = $ftp->dir;
204 unless ($can_dir) {
205 my ($can_ls) = $ftp->ls;
206 unless ($can_ls) {
207 $conf->{is_success} = 0;
208 $conf->{message}
209 = "Can not dir or ls in $directories ",
210 $ftp->message;
211 return $conf;
212 }
213 }
214 }
215
216 $conf->{is_success} = 1;
217 return $conf;
6e7dc4a9
JH
218 }
219}
220
23c4d79e
LB
221__DATA__
222# these are fine but give errors
223ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224ftp://ftp.stratus.com/pub/vos/utility/utility.html
225
7c509e8a 226# these are missing, sigh
23c4d79e
LB
227ftp://ftp.sco.com/SLS/ptf7051e.Z
228http://perlmonks.thepen.com/42898.html
7c509e8a
LB
229http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230http://public.activestate.com/cgi-bin/perlbrowse
231http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233http://my.smithmicro.com/mac/stuffit/
234http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
05558683
MM
236http://www.openzaurus.org/
237http://Casbah.org/
238http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239http://www.pvhp.com/~pvhp/
240http://www.pvhp.com/%7Epvhp/
241http://www.pvhp.com/%7epvhp/
242http://www.leo.org
243http://www.madgoat.com
244http://www.mks.com/s390/gnu/
05558683
MM
245http://www.tpj.com/
246http://safaribooksonline.com/
247http://use.perl.org/~autrijus/journal/25768
248http://www.s390.ibm.com/products/oe/bpxqp1.html
249http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
253http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
254http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
255http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
256http://www.w3.org/Security/Faq/
7c509e8a
LB
257
258# these are URI extraction bugs
23c4d79e
LB
259http://www.perl.org/E
260http://en.wikipedia.org/wiki/SREC_(file_format
261http://somewhere.else',-type=/
262ftp:passive-mode
263ftp:
264http:[-
265http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
266http://www.xray.mpe.mpg.de/mailing-lists/perl5-
267http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
7c509e8a
LB
268http://perl.come/
269http://www.perl.come/
23c4d79e
LB
270
271# these are used as an example
272http://example.com/
273http://something.here/
274http://users.perl5.git.perl.org/~yourlogin/
275http://github.com/USERNAME/perl/tree/orange
276http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
277http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
278http://somewhere.else$/
279http://somewhere.else$/
280http://somewhere.else/bin/foo&bar',-Type=
281http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
282http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
283http://www.perl.org/test.cgi
284http://cpan2.local/
285http://search.cpan.org/perldoc?
286http://cpan1.local/
287http://cpan.dev.local/CPAN
288http:///
289ftp://
290ftp://myurl/
291ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
292http://www14.software.ibm.com/webapp/download/downloadaz.jsp
293http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
294http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
295http://localhost/tmp/index.txt
296http://example.com/foo/bar.html
297http://example.com/Text-Bastardize-1.06.tar.gz
298ftp://example.com/sources/packages.txt
299http://example.com/sources/packages.txt
300http://example.com/sources
301ftp://example.com/sources
302http://some.where.com/dir/file.txt
303http://some.where.com/dir/a.txt
304http://foo.com/X.tgz
305ftp://foo.com/X.tgz
306http://foo/
307http://www.foo.com:8000/
308http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
309http://decoded/mirror/path
310http://a/b/c/d/e/f/g/h/i/j
311http://foo/bar.gz
312ftp://ftp.perl.org
313http://purl.org/rss/1.0/modules/taxonomy/
314ftp://ftp.sun.ac.za/CPAN/CPAN/
315ftp://ftp.cpan.org/pub/mirror/index.txt
316ftp://cpan.org/pub/mirror/index.txt
317http://example.com/~eh/
318http://plagger.org/.../rss
319http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
320http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
321http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
322http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
323http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
324http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
325http://module-build.sourceforge.net/META-spec-new.html
326http://module-build.sourceforge.net/META-spec-v1.4.html
327http://www.cs.vu.nl/~tmgil/vi.html
328http://perlcomposer.sourceforge.net/vperl.html
329http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
330http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
331http://world.std.com/~aep/ptkdb/
332http://www.castlelink.co.uk/object_system/
333http://www.fh-wedel.de/elvis/
334ftp://ftp.blarg.net/users/amol/zsh/
335ftp://ftp.funet.fi/pub/languages/perl/CPAN
336http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
7c509e8a
LB
337http://users.perl5.git.perl.org/~USERNAME
338http://foo/x//y/script.cgi/a//b
339http://xxx/script.cgi/http://foo
340http://foo/./x//z/script.cgi/a/../b//c
341http://somewhere.else/in/movie/land
342http://somewhere.else/finished.html
343http://somewhere.else/bin/foo&bar$
344http://somewhere.else/
345http://proxy:8484/
346http://proxy/
347http://myrepo.example.com/
348http://remote/source
349https://example.com/
350http://example.com:1024/
351http:///path?foo=bar
352http://[::]:1024/
353http://([/
354http://example.com:9000/index.html
355http://proxy.example.com:8080/
356http:///index.html
357http://[www.json::pp.org]/
358http://localhost/
359http://foo.example.com/
360http://abc.com/a.js
361http://whatever/man/1/crontab
362http://abc.com/c.js
363http://whatever/Foo%3A%3ABar
364http://abc.com/b.js
365http://remote.server.com/jquery.css
366http://some.other.com/page.html
367https://text.com/1/2
368https://text.com/1/2
369http://link.included.here?o=1&p=2
370http://link.included.here?o=1&amp;p=2
371http://link.included.here?o=1&amp;p=2
372http://link.included.here/
373http://foo/x//y/script.cgi/a//b
374http://xxx/script.cgi/http://foo
375http://foo/./x//z/script.cgi/a/../b//c
376http://somewhere.else/in/movie/land
377http://somewhere.else/finished.html
378http://webproxy:3128/
379http://www/
23c4d79e
LB
380
381# these are used to generate or match URLs
382http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
383http://www.cpantesters.org/show/%s.yaml
384ftp://(.*?)/(.*)/(.*
385ftp://(.*?)/(.*)/(.*
386ftp://(.*?)/(.*)/(.*
387ftp://ftp.foo.bar/
388http://$host/
389http://wwwe%3C46/
390ftp:/
7c509e8a
LB
391http://$addr/mark?commit=$
392http://search.cpan.org/~
393http:/
394ftp:%5Cn$url
395http://www.ietf.org/rfc/rfc$2.txt
396http://search.cpan.org/~
397ftp:%5Cn$url
23c4d79e
LB
398
399# weird redirects that LWP doesn't like
400http://www.theperlreview.com/community_calendar
401http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
7c509e8a 402http://sunsolve.sun.com
23c4d79e
LB
403
404# broken webserver that doesn't like HEAD requests
405http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
7c509e8a 406http://www.w3.org/TR/html4/loose.dtd
23c4d79e
LB
407
408# these have been reported upstream to CPAN authors
409http://www.gnu.org/manual/tar/html_node/tar_139.html
410http://www.w3.org/pub/WWW/TR/Wd-css-1.html
411http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
412http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
413http://search.cpan.org/search?query=Module::Build::Convert
414http://www.refcnt.org/papers/module-build-convert
415http://csrc.nist.gov/cryptval/shs.html
416http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
417http://www.debian.or.jp/~kubota/unicode-symbols.html.en
418http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
419http://www.debian.or.jp/~kubota/unicode-symbols.html.en
420http://rfc.net/rfc2781.html
421http://www.icu-project.org/charset/
422http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
423http://www.rfc-editor.org/
424http://www.rfc.net/
425http://www.oreilly.com/people/authors/lunde/cjk_inf.html
426http://www.oreilly.com/catalog/cjkvinfo/
427http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
428http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
429http://www.egt.ie/standards/iso3166/iso3166-1-en.html
430http://www.bsi-global.com/iso4217currency
431http://www.plover.com/~mjd/perl/Memoize/
432http://www.plover.com/~mjd/perl/MiniMemoize/
433http://www.sysadminmag.com/tpj/issues/vol5_5/
434ftp://ftp.tpc.int/tpc/server/UNIX/
435http://www.nara.gov/genealogy/
436http://home.utah-inter.net/kinsearch/Soundex.html
437http://www.nara.gov/genealogy/soundex/soundex.html
438http://rfc.net/rfc3461.html
439ftp://ftp.cs.pdx.edu/pub/elvis/
440http://www.fh-wedel.de/elvis/
7d0fb9b8 441http://lists.perl.org/list/perl-mvs.html
7c509e8a
LB
442http://www.cpan.org/ports/os2/
443http://github.com/dagolden/cpan-meta-spec
444http://github.com/dagolden/cpan-meta-spec/issues
445http://www.opensource.org/licenses/lgpl-license.phpt
446http://reality.sgi.com/ariel
447http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
448http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449http://www.nsrl.nist.gov/testdata/
450http://public.activestate.com/cgi-bin/perlbrowse/p/31194
451http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
452http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
453http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
454http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
455http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
456http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
457http://github.com/schwern/extutils-makemaker
458https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
459http://www.json.org/JSON::PP_checker/
460ftp://ftp.kiae.su/pub/unix/fido/
461http://www.gallistel.net/nparker/weather/code/
462http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
463ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
464http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
465http://public.activestate.com/cgi-bin/perlbrowse/p/33567
466http://public.activestate.com/cgi-bin/perlbrowse/p/33566
467http://www.dsmit.com/cons/
468http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
23c4d79e 469
6e7dc4a9 470__END__
23c4d79e
LB
471
472=head1 NAME
473
474checkURL.pl - Check that all the URLs in the Perl source are valid
475
476=head1 DESCRIPTION
477
478This program checks that all the URLs in the Perl source are valid. It
479checks HTTP and FTP links in parallel and contains a list of known
480bad example links in its source. It takes 4 minutes to run on my
481machine. The results are written to 'uris.txt' and list the filename,
482the URL and the error:
483
484 * ext/Locale-Maketext/lib/Locale/Maketext.pod
485 http://sunsite.dk/RFC/rfc/rfc2277.html
486 404 Not Found
487 ...
488
489It should be run every so often and links fixed and upstream authors
490notified.
491
492Note that the web is unstable and some websites are temporarily down.