This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
embed.fnc: Mark unlnk as Core only
[perl5.git] / Porting / checkURL.pl
CommitLineData
23c4d79e 1#!perl
6e7dc4a9 2use strict;
23c4d79e
LB
3use warnings;
4use autodie;
5use feature qw(say);
d4b5f70a
NC
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
23c4d79e 10use List::Util qw(sum);
d4b5f70a
NC
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
23c4d79e
LB
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20 chomp $line;
21 next if $line =~ /^#/;
22 next unless $line;
23 $ignore{$line} = 1;
24}
6e7dc4a9 25
7c509e8a 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
23c4d79e
LB
27$ua->timeout(58);
28$ua->env_proxy;
6e7dc4a9 29
23c4d79e
LB
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32 unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
39 }
40);
6e7dc4a9 41
23c4d79e
LB
42my %uris;
43foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
7c509e8a
LB
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
d4b5f70a
NC
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
23c4d79e
LB
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
6e7dc4a9 54
23c4d79e
LB
55 # no need to hit rt.perl.org
56 next
3ef6a87a 57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
6e7dc4a9 58
b4285b0d
DB
59 # no need to hit github
60 next
61 if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
62
23c4d79e
LB
63 # no need to hit rt.cpan.org
64 next
7c509e8a
LB
65 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
66
67 # no need to hit google groups (weird redirect LWP does not like)
68 next
69 if $uri =~ m{^http://groups\.google\.com/};
70
23c4d79e
LB
71 push @{ $uris{$uri} }, $filename;
72 }
73 $extract_progress += -s $filename;
74}
75
76my $bw = Parallel::Fork::BossWorkerAsync->new(
77 work_handler => \&work_alarmed,
78 global_timeout => 120,
79 worker_count => 20,
80);
81
82foreach my $uri ( keys %uris ) {
83 my @filenames = @{ $uris{$uri} };
84 $bw->add_work( { uri => $uri, filenames => \@filenames } );
85}
86
87undef $extract_progress;
88
89my $fetch_progress = Term::ProgressBar::Simple->new(
90 { count => scalar( keys %uris ),
91 name => 'Fetching URIs',
92 }
93);
94
95my %filenames;
96while ( $bw->pending() ) {
97 my $response = $bw->get_result();
98 my $uri = $response->{uri};
99 my @filenames = @{ $response->{filenames} };
100 my $is_success = $response->{is_success};
101 my $message = $response->{message};
102
103 unless ($is_success) {
104 foreach my $filename (@filenames) {
105 push @{ $filenames{$filename} },
106 { uri => $uri, message => $message };
6e7dc4a9
JH
107 }
108 }
23c4d79e 109 $fetch_progress++;
6e7dc4a9 110}
23c4d79e 111$bw->shut_down();
6e7dc4a9 112
23c4d79e
LB
113my $fh = IO::File->new('> uris.txt');
114foreach my $filename ( sort keys %filenames ) {
115 $fh->say("* $filename");
116 my @bits = @{ $filenames{$filename} };
117 foreach my $bit (@bits) {
118 my $uri = $bit->{uri};
119 my $message = $bit->{message};
120 $fh->say(" $uri");
121 $fh->say(" $message");
0d6d7233
JH
122 }
123}
23c4d79e 124$fh->close;
0d6d7233 125
23c4d79e 126say 'Finished, see uris.txt';
6e7dc4a9 127
23c4d79e
LB
128sub work_alarmed {
129 my $conf = shift;
130 eval {
131 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
132 alarm 60;
133 $conf = work($conf);
134 alarm 0;
135 };
136 if ($@) {
137 $conf->{is_success} = 0;
138 $conf->{message} = 'Timed out';
0d6d7233 139
23c4d79e
LB
140 }
141 return $conf;
0d6d7233
JH
142}
143
23c4d79e
LB
144sub work {
145 my $conf = shift;
146 my $uri = $conf->{uri};
147 my @filenames = @{ $conf->{filenames} };
0d6d7233 148
23c4d79e
LB
149 if ( $uri =~ /^http/ ) {
150 my $uri_without_fragment = URI->new($uri);
151 my $fragment = $uri_without_fragment->fragment(undef);
152 my $response = $ua->head($uri_without_fragment);
0d6d7233 153
23c4d79e
LB
154 $conf->{is_success} = $response->is_success;
155 $conf->{message} = $response->status_line;
156 return $conf;
157 } else {
0d6d7233 158
23c4d79e
LB
159 my $uri_object = URI->new($uri);
160 my $host = $uri_object->host;
161 my $path = $uri_object->path;
162 my ( $volume, $directories, $filename )
163 = File::Spec->splitpath($path);
0d6d7233 164
23c4d79e
LB
165 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
166 unless ($ftp) {
167 $conf->{is_succcess} = 0;
168 $conf->{message} = "Can not connect to $host: $@";
169 return $conf;
170 }
0d6d7233 171
23c4d79e
LB
172 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173 unless ($can_login) {
174 $conf->{is_success} = 0;
175 $conf->{message} = "Can not login ", $ftp->message;
176 return $conf;
6e7dc4a9
JH
177 }
178
23c4d79e
LB
179 my $can_binary = $ftp->binary();
180 unless ($can_binary) {
181 $conf->{is_success} = 0;
182 $conf->{message} = "Can not binary ", $ftp->message;
183 return $conf;
184 }
185
186 my $can_cwd = $ftp->cwd($directories);
187 unless ($can_cwd) {
188 $conf->{is_success} = 0;
189 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
190 return $conf;
191 }
192
193 if ($filename) {
194 my $can_size = $ftp->size($filename);
195 unless ($can_size) {
196 $conf->{is_success} = 0;
197 $conf->{message}
198 = "Can not size $filename in $directories",
199 $ftp->message;
200 return $conf;
201 }
202 } else {
203 my ($can_dir) = $ftp->dir;
204 unless ($can_dir) {
205 my ($can_ls) = $ftp->ls;
206 unless ($can_ls) {
207 $conf->{is_success} = 0;
208 $conf->{message}
209 = "Can not dir or ls in $directories ",
210 $ftp->message;
211 return $conf;
212 }
213 }
214 }
215
216 $conf->{is_success} = 1;
217 return $conf;
6e7dc4a9
JH
218 }
219}
220
23c4d79e
LB
221__DATA__
222# these are fine but give errors
223ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224ftp://ftp.stratus.com/pub/vos/utility/utility.html
225
7c509e8a 226# these are missing, sigh
23c4d79e
LB
227ftp://ftp.sco.com/SLS/ptf7051e.Z
228http://perlmonks.thepen.com/42898.html
7c509e8a
LB
229http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230http://public.activestate.com/cgi-bin/perlbrowse
231http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233http://my.smithmicro.com/mac/stuffit/
234http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
05558683
MM
236http://www.openzaurus.org/
237http://Casbah.org/
238http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239http://www.pvhp.com/~pvhp/
240http://www.pvhp.com/%7Epvhp/
241http://www.pvhp.com/%7epvhp/
242http://www.leo.org
243http://www.madgoat.com
244http://www.mks.com/s390/gnu/
245http://www.research.att.com/sw/tools/uwin/
246http://www.tpj.com/
247http://safaribooksonline.com/
248http://use.perl.org/~autrijus/journal/25768
249http://www.s390.ibm.com/products/oe/bpxqp1.html
250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
253http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
254http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
255http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
256http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
257http://www.w3.org/Security/Faq/
7c509e8a
LB
258
259# these are URI extraction bugs
23c4d79e
LB
260http://www.perl.org/E
261http://en.wikipedia.org/wiki/SREC_(file_format
262http://somewhere.else',-type=/
263ftp:passive-mode
264ftp:
265http:[-
266http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
267http://www.xray.mpe.mpg.de/mailing-lists/perl5-
268http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
7c509e8a
LB
269http://perl.come/
270http://www.perl.come/
23c4d79e
LB
271
272# these are used as an example
273http://example.com/
274http://something.here/
275http://users.perl5.git.perl.org/~yourlogin/
276http://github.com/USERNAME/perl/tree/orange
277http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
278http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
279http://somewhere.else$/
280http://somewhere.else$/
281http://somewhere.else/bin/foo&bar',-Type=
282http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
283http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
284http://www.perl.org/test.cgi
285http://cpan2.local/
286http://search.cpan.org/perldoc?
287http://cpan1.local/
288http://cpan.dev.local/CPAN
289http:///
290ftp://
291ftp://myurl/
292ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
293http://www14.software.ibm.com/webapp/download/downloadaz.jsp
294http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
295http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
296http://localhost/tmp/index.txt
297http://example.com/foo/bar.html
298http://example.com/Text-Bastardize-1.06.tar.gz
299ftp://example.com/sources/packages.txt
300http://example.com/sources/packages.txt
301http://example.com/sources
302ftp://example.com/sources
303http://some.where.com/dir/file.txt
304http://some.where.com/dir/a.txt
305http://foo.com/X.tgz
306ftp://foo.com/X.tgz
307http://foo/
308http://www.foo.com:8000/
309http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
310http://decoded/mirror/path
311http://a/b/c/d/e/f/g/h/i/j
312http://foo/bar.gz
313ftp://ftp.perl.org
314http://purl.org/rss/1.0/modules/taxonomy/
315ftp://ftp.sun.ac.za/CPAN/CPAN/
316ftp://ftp.cpan.org/pub/mirror/index.txt
317ftp://cpan.org/pub/mirror/index.txt
318http://example.com/~eh/
319http://plagger.org/.../rss
320http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
321http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
322http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
323http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
324http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
325http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
326http://module-build.sourceforge.net/META-spec-new.html
327http://module-build.sourceforge.net/META-spec-v1.4.html
328http://www.cs.vu.nl/~tmgil/vi.html
329http://perlcomposer.sourceforge.net/vperl.html
330http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
331http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
332http://world.std.com/~aep/ptkdb/
333http://www.castlelink.co.uk/object_system/
334http://www.fh-wedel.de/elvis/
335ftp://ftp.blarg.net/users/amol/zsh/
336ftp://ftp.funet.fi/pub/languages/perl/CPAN
337http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
7c509e8a
LB
338http://users.perl5.git.perl.org/~USERNAME
339http://foo/x//y/script.cgi/a//b
340http://xxx/script.cgi/http://foo
341http://foo/./x//z/script.cgi/a/../b//c
342http://somewhere.else/in/movie/land
343http://somewhere.else/finished.html
344http://somewhere.else/bin/foo&bar$
345http://somewhere.else/
346http://proxy:8484/
347http://proxy/
348http://myrepo.example.com/
349http://remote/source
350https://example.com/
351http://example.com:1024/
352http:///path?foo=bar
353http://[::]:1024/
354http://([/
355http://example.com:9000/index.html
356http://proxy.example.com:8080/
357http:///index.html
358http://[www.json::pp.org]/
359http://localhost/
360http://foo.example.com/
361http://abc.com/a.js
362http://whatever/man/1/crontab
363http://abc.com/c.js
364http://whatever/Foo%3A%3ABar
365http://abc.com/b.js
366http://remote.server.com/jquery.css
367http://some.other.com/page.html
368https://text.com/1/2
369https://text.com/1/2
370http://link.included.here?o=1&p=2
371http://link.included.here?o=1&amp;p=2
372http://link.included.here?o=1&amp;p=2
373http://link.included.here/
374http://foo/x//y/script.cgi/a//b
375http://xxx/script.cgi/http://foo
376http://foo/./x//z/script.cgi/a/../b//c
377http://somewhere.else/in/movie/land
378http://somewhere.else/finished.html
379http://webproxy:3128/
380http://www/
23c4d79e
LB
381
382# these are used to generate or match URLs
383http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
384http://www.cpantesters.org/show/%s.yaml
385ftp://(.*?)/(.*)/(.*
386ftp://(.*?)/(.*)/(.*
387ftp://(.*?)/(.*)/(.*
388ftp://ftp.foo.bar/
389http://$host/
390http://wwwe%3C46/
391ftp:/
7c509e8a
LB
392http://$addr/mark?commit=$
393http://search.cpan.org/~
394http:/
395ftp:%5Cn$url
396http://www.ietf.org/rfc/rfc$2.txt
397http://search.cpan.org/~
398ftp:%5Cn$url
23c4d79e
LB
399
400# weird redirects that LWP doesn't like
401http://www.theperlreview.com/community_calendar
402http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
7c509e8a 403http://sunsolve.sun.com
23c4d79e
LB
404
405# broken webserver that doesn't like HEAD requests
406http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
7c509e8a 407http://www.w3.org/TR/html4/loose.dtd
23c4d79e
LB
408
409# these have been reported upstream to CPAN authors
410http://www.gnu.org/manual/tar/html_node/tar_139.html
411http://www.w3.org/pub/WWW/TR/Wd-css-1.html
412http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
413http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
414http://search.cpan.org/search?query=Module::Build::Convert
415http://www.refcnt.org/papers/module-build-convert
416http://csrc.nist.gov/cryptval/shs.html
417http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
418http://www.debian.or.jp/~kubota/unicode-symbols.html.en
419http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
420http://www.debian.or.jp/~kubota/unicode-symbols.html.en
421http://rfc.net/rfc2781.html
422http://www.icu-project.org/charset/
423http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
424http://www.rfc-editor.org/
425http://www.rfc.net/
426http://www.oreilly.com/people/authors/lunde/cjk_inf.html
427http://www.oreilly.com/catalog/cjkvinfo/
428http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
429http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
430http://www.egt.ie/standards/iso3166/iso3166-1-en.html
431http://www.bsi-global.com/iso4217currency
432http://www.plover.com/~mjd/perl/Memoize/
433http://www.plover.com/~mjd/perl/MiniMemoize/
434http://www.sysadminmag.com/tpj/issues/vol5_5/
435ftp://ftp.tpc.int/tpc/server/UNIX/
436http://www.nara.gov/genealogy/
437http://home.utah-inter.net/kinsearch/Soundex.html
438http://www.nara.gov/genealogy/soundex/soundex.html
439http://rfc.net/rfc3461.html
440ftp://ftp.cs.pdx.edu/pub/elvis/
441http://www.fh-wedel.de/elvis/
7d0fb9b8 442http://lists.perl.org/list/perl-mvs.html
7c509e8a
LB
443http://www.cpan.org/ports/os2/
444http://github.com/dagolden/cpan-meta-spec
445http://github.com/dagolden/cpan-meta-spec/issues
446http://www.opensource.org/licenses/lgpl-license.phpt
447http://reality.sgi.com/ariel
448http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
450http://www.nsrl.nist.gov/testdata/
451http://public.activestate.com/cgi-bin/perlbrowse/p/31194
452http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
453http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
454http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
455http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
456http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
457http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
458http://github.com/schwern/extutils-makemaker
459https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
460http://www.json.org/JSON::PP_checker/
461ftp://ftp.kiae.su/pub/unix/fido/
462http://www.gallistel.net/nparker/weather/code/
463http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
464ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
465http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
466http://public.activestate.com/cgi-bin/perlbrowse/p/33567
467http://public.activestate.com/cgi-bin/perlbrowse/p/33566
468http://www.dsmit.com/cons/
469http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
23c4d79e 470
6e7dc4a9 471__END__
23c4d79e
LB
472
473=head1 NAME
474
475checkURL.pl - Check that all the URLs in the Perl source are valid
476
477=head1 DESCRIPTION
478
479This program checks that all the URLs in the Perl source are valid. It
480checks HTTP and FTP links in parallel and contains a list of known
481bad example links in its source. It takes 4 minutes to run on my
482machine. The results are written to 'uris.txt' and list the filename,
483the URL and the error:
484
485 * ext/Locale-Maketext/lib/Locale/Maketext.pod
486 http://sunsite.dk/RFC/rfc/rfc2277.html
487 404 Not Found
488 ...
489
490It should be run every so often and links fixed and upstream authors
491notified.
492
493Note that the web is unstable and some websites are temporarily down.