This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade Unicode-Collate from version 1.17 to 1.18
[perl5.git] / Porting / checkURL.pl
... / ...
CommitLineData
1#!perl
2use strict;
3use warnings;
4use autodie;
5use feature qw(say);
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
10use List::Util qw(sum);
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20 chomp $line;
21 next if $line =~ /^#/;
22 next unless $line;
23 $ignore{$line} = 1;
24}
25
26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27$ua->timeout(58);
28$ua->env_proxy;
29
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32 unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
39 }
40);
41
42my %uris;
43foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
54
55 # no need to hit rt.perl.org
56 next
57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59 # no need to hit rt.cpan.org
60 next
61 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
62
63 # no need to hit google groups (weird redirect LWP does not like)
64 next
65 if $uri =~ m{^http://groups\.google\.com/};
66
67 push @{ $uris{$uri} }, $filename;
68 }
69 $extract_progress += -s $filename;
70}
71
72my $bw = Parallel::Fork::BossWorkerAsync->new(
73 work_handler => \&work_alarmed,
74 global_timeout => 120,
75 worker_count => 20,
76);
77
78foreach my $uri ( keys %uris ) {
79 my @filenames = @{ $uris{$uri} };
80 $bw->add_work( { uri => $uri, filenames => \@filenames } );
81}
82
83undef $extract_progress;
84
85my $fetch_progress = Term::ProgressBar::Simple->new(
86 { count => scalar( keys %uris ),
87 name => 'Fetching URIs',
88 }
89);
90
91my %filenames;
92while ( $bw->pending() ) {
93 my $response = $bw->get_result();
94 my $uri = $response->{uri};
95 my @filenames = @{ $response->{filenames} };
96 my $is_success = $response->{is_success};
97 my $message = $response->{message};
98
99 unless ($is_success) {
100 foreach my $filename (@filenames) {
101 push @{ $filenames{$filename} },
102 { uri => $uri, message => $message };
103 }
104 }
105 $fetch_progress++;
106}
107$bw->shut_down();
108
109my $fh = IO::File->new('> uris.txt');
110foreach my $filename ( sort keys %filenames ) {
111 $fh->say("* $filename");
112 my @bits = @{ $filenames{$filename} };
113 foreach my $bit (@bits) {
114 my $uri = $bit->{uri};
115 my $message = $bit->{message};
116 $fh->say(" $uri");
117 $fh->say(" $message");
118 }
119}
120$fh->close;
121
122say 'Finished, see uris.txt';
123
124sub work_alarmed {
125 my $conf = shift;
126 eval {
127 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
128 alarm 60;
129 $conf = work($conf);
130 alarm 0;
131 };
132 if ($@) {
133 $conf->{is_success} = 0;
134 $conf->{message} = 'Timed out';
135
136 }
137 return $conf;
138}
139
140sub work {
141 my $conf = shift;
142 my $uri = $conf->{uri};
143 my @filenames = @{ $conf->{filenames} };
144
145 if ( $uri =~ /^http/ ) {
146 my $uri_without_fragment = URI->new($uri);
147 my $fragment = $uri_without_fragment->fragment(undef);
148 my $response = $ua->head($uri_without_fragment);
149
150 $conf->{is_success} = $response->is_success;
151 $conf->{message} = $response->status_line;
152 return $conf;
153 } else {
154
155 my $uri_object = URI->new($uri);
156 my $host = $uri_object->host;
157 my $path = $uri_object->path;
158 my ( $volume, $directories, $filename )
159 = File::Spec->splitpath($path);
160
161 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
162 unless ($ftp) {
163 $conf->{is_succcess} = 0;
164 $conf->{message} = "Can not connect to $host: $@";
165 return $conf;
166 }
167
168 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
169 unless ($can_login) {
170 $conf->{is_success} = 0;
171 $conf->{message} = "Can not login ", $ftp->message;
172 return $conf;
173 }
174
175 my $can_binary = $ftp->binary();
176 unless ($can_binary) {
177 $conf->{is_success} = 0;
178 $conf->{message} = "Can not binary ", $ftp->message;
179 return $conf;
180 }
181
182 my $can_cwd = $ftp->cwd($directories);
183 unless ($can_cwd) {
184 $conf->{is_success} = 0;
185 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
186 return $conf;
187 }
188
189 if ($filename) {
190 my $can_size = $ftp->size($filename);
191 unless ($can_size) {
192 $conf->{is_success} = 0;
193 $conf->{message}
194 = "Can not size $filename in $directories",
195 $ftp->message;
196 return $conf;
197 }
198 } else {
199 my ($can_dir) = $ftp->dir;
200 unless ($can_dir) {
201 my ($can_ls) = $ftp->ls;
202 unless ($can_ls) {
203 $conf->{is_success} = 0;
204 $conf->{message}
205 = "Can not dir or ls in $directories ",
206 $ftp->message;
207 return $conf;
208 }
209 }
210 }
211
212 $conf->{is_success} = 1;
213 return $conf;
214 }
215}
216
217__DATA__
218# these are fine but give errors
219ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
220ftp://ftp.stratus.com/pub/vos/utility/utility.html
221
222# these are missing, sigh
223ftp://ftp.sco.com/SLS/ptf7051e.Z
224http://perlmonks.thepen.com/42898.html
225http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
226http://public.activestate.com/cgi-bin/perlbrowse
227http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
228http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
229http://my.smithmicro.com/mac/stuffit/
230http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
231http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
232
233# these are URI extraction bugs
234http://www.perl.org/E
235http://en.wikipedia.org/wiki/SREC_(file_format
236http://somewhere.else',-type=/
237ftp:passive-mode
238ftp:
239http:[-
240http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
241http://www.xray.mpe.mpg.de/mailing-lists/perl5-
242http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
243http://perl.come/
244http://www.perl.come/
245
246# these are used as an example
247http://example.com/
248http://something.here/
249http://users.perl5.git.perl.org/~yourlogin/
250http://github.com/USERNAME/perl/tree/orange
251http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
252http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
253http://somewhere.else$/
254http://somewhere.else$/
255http://somewhere.else/bin/foo&bar',-Type=
256http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
257http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
258http://www.perl.org/test.cgi
259http://cpan2.local/
260http://search.cpan.org/perldoc?
261http://cpan1.local/
262http://cpan.dev.local/CPAN
263http:///
264ftp://
265ftp://myurl/
266ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
267http://www14.software.ibm.com/webapp/download/downloadaz.jsp
268http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
269http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
270http://localhost/tmp/index.txt
271http://example.com/foo/bar.html
272http://example.com/Text-Bastardize-1.06.tar.gz
273ftp://example.com/sources/packages.txt
274http://example.com/sources/packages.txt
275http://example.com/sources
276ftp://example.com/sources
277http://some.where.com/dir/file.txt
278http://some.where.com/dir/a.txt
279http://foo.com/X.tgz
280ftp://foo.com/X.tgz
281http://foo/
282http://www.foo.com:8000/
283http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
284http://decoded/mirror/path
285http://a/b/c/d/e/f/g/h/i/j
286http://foo/bar.gz
287ftp://ftp.perl.org
288http://purl.org/rss/1.0/modules/taxonomy/
289ftp://ftp.sun.ac.za/CPAN/CPAN/
290ftp://ftp.cpan.org/pub/mirror/index.txt
291ftp://cpan.org/pub/mirror/index.txt
292http://example.com/~eh/
293http://plagger.org/.../rss
294http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
295http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
296http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
297http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
298http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
299http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
300http://module-build.sourceforge.net/META-spec-new.html
301http://module-build.sourceforge.net/META-spec-v1.4.html
302http://www.cs.vu.nl/~tmgil/vi.html
303http://perlcomposer.sourceforge.net/vperl.html
304http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
305http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
306http://world.std.com/~aep/ptkdb/
307http://www.castlelink.co.uk/object_system/
308http://www.fh-wedel.de/elvis/
309ftp://ftp.blarg.net/users/amol/zsh/
310ftp://ftp.funet.fi/pub/languages/perl/CPAN
311http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
312http://users.perl5.git.perl.org/~USERNAME
313http://foo/x//y/script.cgi/a//b
314http://xxx/script.cgi/http://foo
315http://foo/./x//z/script.cgi/a/../b//c
316http://somewhere.else/in/movie/land
317http://somewhere.else/finished.html
318http://somewhere.else/bin/foo&bar$
319http://somewhere.else/
320http://proxy:8484/
321http://proxy/
322http://myrepo.example.com/
323http://remote/source
324https://example.com/
325http://example.com:1024/
326http:///path?foo=bar
327http://[::]:1024/
328http://([/
329http://example.com:9000/index.html
330http://proxy.example.com:8080/
331http:///index.html
332http://[www.json::pp.org]/
333http://localhost/
334http://foo.example.com/
335http://abc.com/a.js
336http://whatever/man/1/crontab
337http://abc.com/c.js
338http://whatever/Foo%3A%3ABar
339http://abc.com/b.js
340http://remote.server.com/jquery.css
341http://some.other.com/page.html
342https://text.com/1/2
343https://text.com/1/2
344http://link.included.here?o=1&p=2
345http://link.included.here?o=1&amp;p=2
346http://link.included.here?o=1&amp;p=2
347http://link.included.here/
348http://foo/x//y/script.cgi/a//b
349http://xxx/script.cgi/http://foo
350http://foo/./x//z/script.cgi/a/../b//c
351http://somewhere.else/in/movie/land
352http://somewhere.else/finished.html
353http://webproxy:3128/
354http://www/
355
356# these are used to generate or match URLs
357http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
358http://www.cpantesters.org/show/%s.yaml
359ftp://(.*?)/(.*)/(.*
360ftp://(.*?)/(.*)/(.*
361ftp://(.*?)/(.*)/(.*
362ftp://ftp.foo.bar/
363http://$host/
364http://wwwe%3C46/
365ftp:/
366http://$addr/mark?commit=$
367http://search.cpan.org/~
368http:/
369ftp:%5Cn$url
370http://www.ietf.org/rfc/rfc$2.txt
371http://search.cpan.org/~
372ftp:%5Cn$url
373
374# weird redirects that LWP doesn't like
375http://www.theperlreview.com/community_calendar
376http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
377http://sunsolve.sun.com
378
379# broken webserver that doesn't like HEAD requests
380http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
381http://www.w3.org/TR/html4/loose.dtd
382
383# these have been reported upstream to CPAN authors
384http://www.gnu.org/manual/tar/html_node/tar_139.html
385http://www.w3.org/pub/WWW/TR/Wd-css-1.html
386http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
387http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
388http://search.cpan.org/search?query=Module::Build::Convert
389http://www.refcnt.org/papers/module-build-convert
390http://csrc.nist.gov/cryptval/shs.html
391http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
392http://www.debian.or.jp/~kubota/unicode-symbols.html.en
393http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
394http://www.debian.or.jp/~kubota/unicode-symbols.html.en
395http://rfc.net/rfc2781.html
396http://www.icu-project.org/charset/
397http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
398http://www.rfc-editor.org/
399http://www.rfc.net/
400http://www.oreilly.com/people/authors/lunde/cjk_inf.html
401http://www.oreilly.com/catalog/cjkvinfo/
402http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
403http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
404http://www.egt.ie/standards/iso3166/iso3166-1-en.html
405http://www.bsi-global.com/iso4217currency
406http://www.plover.com/~mjd/perl/Memoize/
407http://www.plover.com/~mjd/perl/MiniMemoize/
408http://www.sysadminmag.com/tpj/issues/vol5_5/
409ftp://ftp.tpc.int/tpc/server/UNIX/
410http://www.nara.gov/genealogy/
411http://home.utah-inter.net/kinsearch/Soundex.html
412http://www.nara.gov/genealogy/soundex/soundex.html
413http://rfc.net/rfc3461.html
414ftp://ftp.cs.pdx.edu/pub/elvis/
415http://www.fh-wedel.de/elvis/
416http://lists.perl.org/list/perl-mvs.html
417http://www.cpan.org/ports/os2/
418http://github.com/dagolden/cpan-meta-spec
419http://github.com/dagolden/cpan-meta-spec/issues
420http://www.opensource.org/licenses/lgpl-license.phpt
421http://reality.sgi.com/ariel
422http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
423http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
424http://www.nsrl.nist.gov/testdata/
425http://public.activestate.com/cgi-bin/perlbrowse/p/31194
426http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
427http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
428http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
429http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
430http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
431http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
432http://github.com/schwern/extutils-makemaker
433https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
434http://www.json.org/JSON::PP_checker/
435ftp://ftp.kiae.su/pub/unix/fido/
436http://www.gallistel.net/nparker/weather/code/
437http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
438ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
439http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
440http://public.activestate.com/cgi-bin/perlbrowse/p/33567
441http://public.activestate.com/cgi-bin/perlbrowse/p/33566
442http://www.dsmit.com/cons/
443http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
444
445__END__
446
447=head1 NAME
448
449checkURL.pl - Check that all the URLs in the Perl source are valid
450
451=head1 DESCRIPTION
452
453This program checks that all the URLs in the Perl source are valid. It
454checks HTTP and FTP links in parallel and contains a list of known
455bad example links in its source. It takes 4 minutes to run on my
456machine. The results are written to 'uris.txt' and list the filename,
457the URL and the error:
458
459 * ext/Locale-Maketext/lib/Locale/Maketext.pod
460 http://sunsite.dk/RFC/rfc/rfc2277.html
461 404 Not Found
462 ...
463
464It should be run every so often and links fixed and upstream authors
465notified.
466
467Note that the web is unstable and some websites are temporarily down.