This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Regression test for 34394ecd - SVs that were only on the tmps stack leaked.
[perl5.git] / Porting / checkURL.pl
1 #!perl
2 use strict;
3 use warnings;
4 use autodie;
5 use feature qw(say);
6 use File::Find::Rule;
7 use File::Slurp;
8 use File::Spec;
9 use IO::Socket::SSL;
10 use List::Util qw(sum);
11 use LWP::UserAgent;
12 use Net::FTP;
13 use Parallel::Fork::BossWorkerAsync;
14 use Term::ProgressBar::Simple;
15 use URI::Find::Simple qw( list_uris );
16 $| = 1;
17
18 my %ignore;
19 while ( my $line = <main::DATA> ) {
20     chomp $line;
21     next if $line =~ /^#/;
22     next unless $line;
23     $ignore{$line} = 1;
24 }
25
26 my $ua = LWP::UserAgent->new;
27 $ua->timeout(58);
28 $ua->env_proxy;
29
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32     unless @filenames;
33
34 my $total_bytes = sum map {-s} @filenames;
35
36 my $extract_progress = Term::ProgressBar::Simple->new(
37     {   count => $total_bytes,
38         name  => 'Extracting URIs',
39     }
40 );
41
42 my %uris;
43 foreach my $filename (@filenames) {
44     next if $filename =~ /uris\.txt/;
45     next if $filename =~ /check_uris/;
46     next if $filename =~ /\.patch$/;
47     my $contents = read_file($filename);
48     my @uris     = list_uris($contents);
49     foreach my $uri (@uris) {
50         next unless $uri =~ /^(http|ftp)/;
51         next if $ignore{$uri};
52
53         # no need to hit rt.perl.org
54         next
55             if $uri =~ m{^http://rt.perl.org/rt3/Ticket/Display.html?id=\d+$};
56
57         # no need to hit rt.cpan.org
58         next
59             if $uri =~ m{^http://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
60         push @{ $uris{$uri} }, $filename;
61     }
62     $extract_progress += -s $filename;
63 }
64
65 my $bw = Parallel::Fork::BossWorkerAsync->new(
66     work_handler   => \&work_alarmed,
67     global_timeout => 120,
68     worker_count   => 20,
69 );
70
71 foreach my $uri ( keys %uris ) {
72     my @filenames = @{ $uris{$uri} };
73     $bw->add_work( { uri => $uri, filenames => \@filenames } );
74 }
75
76 undef $extract_progress;
77
78 my $fetch_progress = Term::ProgressBar::Simple->new(
79     {   count => scalar( keys %uris ),
80         name  => 'Fetching URIs',
81     }
82 );
83
84 my %filenames;
85 while ( $bw->pending() ) {
86     my $response   = $bw->get_result();
87     my $uri        = $response->{uri};
88     my @filenames  = @{ $response->{filenames} };
89     my $is_success = $response->{is_success};
90     my $message    = $response->{message};
91
92     unless ($is_success) {
93         foreach my $filename (@filenames) {
94             push @{ $filenames{$filename} },
95                 { uri => $uri, message => $message };
96         }
97     }
98     $fetch_progress++;
99 }
100 $bw->shut_down();
101
102 my $fh = IO::File->new('> uris.txt');
103 foreach my $filename ( sort keys %filenames ) {
104     $fh->say("* $filename");
105     my @bits = @{ $filenames{$filename} };
106     foreach my $bit (@bits) {
107         my $uri     = $bit->{uri};
108         my $message = $bit->{message};
109         $fh->say("  $uri");
110         $fh->say("    $message");
111     }
112 }
113 $fh->close;
114
115 say 'Finished, see uris.txt';
116
117 sub work_alarmed {
118     my $conf = shift;
119     eval {
120         local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
121         alarm 60;
122         $conf = work($conf);
123         alarm 0;
124     };
125     if ($@) {
126         $conf->{is_success} = 0;
127         $conf->{message}    = 'Timed out';
128
129     }
130     return $conf;
131 }
132
133 sub work {
134     my $conf      = shift;
135     my $uri       = $conf->{uri};
136     my @filenames = @{ $conf->{filenames} };
137
138     if ( $uri =~ /^http/ ) {
139         my $uri_without_fragment = URI->new($uri);
140         my $fragment             = $uri_without_fragment->fragment(undef);
141         my $response             = $ua->head($uri_without_fragment);
142
143         $conf->{is_success} = $response->is_success;
144         $conf->{message}    = $response->status_line;
145         return $conf;
146     } else {
147
148         my $uri_object = URI->new($uri);
149         my $host       = $uri_object->host;
150         my $path       = $uri_object->path;
151         my ( $volume, $directories, $filename )
152             = File::Spec->splitpath($path);
153
154         my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
155         unless ($ftp) {
156             $conf->{is_succcess} = 0;
157             $conf->{message}     = "Can not connect to $host: $@";
158             return $conf;
159         }
160
161         my $can_login = $ftp->login( "anonymous", '-anonymous@' );
162         unless ($can_login) {
163             $conf->{is_success} = 0;
164             $conf->{message} = "Can not login ", $ftp->message;
165             return $conf;
166         }
167
168         my $can_binary = $ftp->binary();
169         unless ($can_binary) {
170             $conf->{is_success} = 0;
171             $conf->{message} = "Can not binary ", $ftp->message;
172             return $conf;
173         }
174
175         my $can_cwd = $ftp->cwd($directories);
176         unless ($can_cwd) {
177             $conf->{is_success} = 0;
178             $conf->{message} = "Can not cwd to $directories ", $ftp->message;
179             return $conf;
180         }
181
182         if ($filename) {
183             my $can_size = $ftp->size($filename);
184             unless ($can_size) {
185                 $conf->{is_success} = 0;
186                 $conf->{message}
187                     = "Can not size $filename in $directories",
188                     $ftp->message;
189                 return $conf;
190             }
191         } else {
192             my ($can_dir) = $ftp->dir;
193             unless ($can_dir) {
194                 my ($can_ls) = $ftp->ls;
195                 unless ($can_ls) {
196                     $conf->{is_success} = 0;
197                     $conf->{message}
198                         = "Can not dir or ls in $directories ",
199                         $ftp->message;
200                     return $conf;
201                 }
202             }
203         }
204
205         $conf->{is_success} = 1;
206         return $conf;
207     }
208 }
209
210 __DATA__
211 # these are fine but give errors
212 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
213 ftp://ftp.stratus.com/pub/vos/utility/utility.html
214
215 # this is missing, sigh
216 ftp://ftp.sco.com/SLS/ptf7051e.Z
217 http://perlmonks.thepen.com/42898.html
218
219 # this are URI extraction bugs
220 http://www.perl.org/E
221 http://en.wikipedia.org/wiki/SREC_(file_format
222 http://somewhere.else',-type=/
223 ftp:passive-mode
224 ftp:
225 http:[-
226 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
227 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
228 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
229
230 # these are used as an example
231 http://example.com/
232 http://something.here/
233 http://users.perl5.git.perl.org/~yourlogin/
234 http://github.com/USERNAME/perl/tree/orange
235 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
236 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
237 http://somewhere.else$/
238 http://somewhere.else$/
239 http://somewhere.else/bin/foo&bar',-Type=
240 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
241 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
242 http://www.perl.org/test.cgi
243 http://cpan2.local/
244 http://search.cpan.org/perldoc?
245 http://cpan1.local/
246 http://cpan.dev.local/CPAN
247 http:///
248 ftp://
249 ftp://myurl/
250 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
251 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
252 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
253 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
254 http://localhost/tmp/index.txt
255 http://example.com/foo/bar.html
256 http://example.com/Text-Bastardize-1.06.tar.gz
257 ftp://example.com/sources/packages.txt
258 http://example.com/sources/packages.txt
259 http://example.com/sources
260 ftp://example.com/sources
261 http://some.where.com/dir/file.txt
262 http://some.where.com/dir/a.txt
263 http://foo.com/X.tgz
264 ftp://foo.com/X.tgz
265 http://foo/
266 http://www.foo.com:8000/
267 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
268 http://decoded/mirror/path
269 http://a/b/c/d/e/f/g/h/i/j
270 http://foo/bar.gz
271 ftp://ftp.perl.org
272 http://purl.org/rss/1.0/modules/taxonomy/
273 ftp://ftp.sun.ac.za/CPAN/CPAN/
274 ftp://ftp.cpan.org/pub/mirror/index.txt
275 ftp://cpan.org/pub/mirror/index.txt
276 http://example.com/~eh/
277 http://plagger.org/.../rss
278 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
279 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
280 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
281 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
282 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
283 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip  
284 http://module-build.sourceforge.net/META-spec-new.html
285 http://module-build.sourceforge.net/META-spec-v1.4.html
286 http://www.cs.vu.nl/~tmgil/vi.html
287 http://perlcomposer.sourceforge.net/vperl.html
288 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
289 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
290 http://world.std.com/~aep/ptkdb/
291 http://www.castlelink.co.uk/object_system/
292 http://www.fh-wedel.de/elvis/
293 ftp://ftp.blarg.net/users/amol/zsh/
294 ftp://ftp.funet.fi/pub/languages/perl/CPAN
295 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
296
297 # these are used to generate or match URLs
298 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
299 http://www.cpantesters.org/show/%s.yaml
300 ftp://(.*?)/(.*)/(.*
301 ftp://(.*?)/(.*)/(.*
302 ftp://(.*?)/(.*)/(.*
303 ftp://ftp.foo.bar/
304 http://$host/
305 http://wwwe%3C46/
306 ftp:/
307
308 # weird redirects that LWP doesn't like
309 http://www.theperlreview.com/community_calendar
310 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
311 http://groups.google.com/
312 http://groups.google.com/group/comp.lang.perl.misc/topics
313 http://groups.google.co.uk/group/perl.perl5.porters/browse_thread/thread/b4e2b1d88280ddff/630b667a66e3509f?#630b667a66e3509f
314 http://groups.google.com/group/comp.sys.sgi.admin/msg/3ad8353bc4ce3cb0
315 http://groups.google.com/group/perl.module.build/browse_thread/thread/c8065052b2e0d741
316 http://groups.google.com/group/perl.perl5.porters/browse_thread/thread/adaffaaf939b042e/20dafc298df737f0%2320dafc298df737f0?sa=X&oi=groupsr&start=0&num=3
317
318 # broken webserver that doesn't like HEAD requests
319 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
320
321 # these have been reported upstream to CPAN authors
322 http://www.gnu.org/manual/tar/html_node/tar_139.html
323 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
324 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
325 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
326 http://search.cpan.org/search?query=Module::Build::Convert
327 http://www.refcnt.org/papers/module-build-convert
328 http://csrc.nist.gov/cryptval/shs.html
329 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
330 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
331 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
332 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
333 http://rfc.net/rfc2781.html
334 http://www.icu-project.org/charset/
335 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
336 http://www.rfc-editor.org/
337 http://www.rfc.net/
338 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
339 http://www.oreilly.com/catalog/cjkvinfo/
340 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
341 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
342 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
343 http://www.bsi-global.com/iso4217currency
344 http://www.plover.com/~mjd/perl/Memoize/
345 http://www.plover.com/~mjd/perl/MiniMemoize/
346 http://www.sysadminmag.com/tpj/issues/vol5_5/
347 ftp://ftp.tpc.int/tpc/server/UNIX/
348 http://www.nara.gov/genealogy/
349 http://home.utah-inter.net/kinsearch/Soundex.html
350 http://www.nara.gov/genealogy/soundex/soundex.html
351 http://rfc.net/rfc3461.html
352 ftp://ftp.cs.pdx.edu/pub/elvis/
353 http://www.fh-wedel.de/elvis/
354
355 __END__
356
357 =head1 NAME
358
359 checkURL.pl - Check that all the URLs in the Perl source are valid
360
361 =head1 DESCRIPTION
362
363 This program checks that all the URLs in the Perl source are valid. It
364 checks HTTP and FTP links in parallel and contains a list of known
365 bad example links in its source. It takes 4 minutes to run on my
366 machine. The results are written to 'uris.txt' and list the filename,
367 the URL and the error:
368
369   * ext/Locale-Maketext/lib/Locale/Maketext.pod
370     http://sunsite.dk/RFC/rfc/rfc2277.html
371       404 Not Found
372   ...
373
374 It should be run every so often and links fixed and upstream authors
375 notified.
376
377 Note that the web is unstable and some websites are temporarily down.