Remove improper use of each() in B::walksymtable and fix ext/B/t/xref.t
[perl.git] / Porting / checkURL.pl
1 #!perl
2 use strict;
3 use warnings;
4 use autodie;
5 use feature qw(say);
6 require File::Find::Rule;
7 require File::Slurp;
8 require File::Spec;
9 require IO::Socket::SSL;
10 use List::Util qw(sum);
11 require LWP::UserAgent;
12 require Net::FTP;
13 require Parallel::Fork::BossWorkerAsync;
14 require Term::ProgressBar::Simple;
15 require URI::Find::Simple;
16 $| = 1;
17
18 my %ignore;
19 while ( my $line = <main::DATA> ) {
20     chomp $line;
21     next if $line =~ /^#/;
22     next unless $line;
23     $ignore{$line} = 1;
24 }
25
26 my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27 $ua->timeout(58);
28 $ua->env_proxy;
29
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32     unless @filenames;
33
34 my $total_bytes = sum map {-s} @filenames;
35
36 my $extract_progress = Term::ProgressBar::Simple->new(
37     {   count => $total_bytes,
38         name  => 'Extracting URIs',
39     }
40 );
41
42 my %uris;
43 foreach my $filename (@filenames) {
44     next if $filename =~ /uris\.txt/;
45     next if $filename =~ /check_uris/;
46     next if $filename =~ /\.patch$/;
47     next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48     next if $filename =~ /checkURL\.pl$/;
49     my $contents = File::Slurp::read_file($filename);
50     my @uris     = URI::Find::Simple::list_uris($contents);
51     foreach my $uri (@uris) {
52         next unless $uri =~ /^(http|ftp)/;
53         next if $ignore{$uri};
54
55         # no need to hit rt.perl.org
56         next
57             if $uri =~ m{^https?://rt.perl.org/rt3/Ticket/Display.html?id=\d+$};
58
59         # no need to hit rt.cpan.org
60         next
61             if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
62
63         # no need to hit google groups (weird redirect LWP does not like)
64         next
65             if $uri =~ m{^http://groups\.google\.com/};
66
67         push @{ $uris{$uri} }, $filename;
68     }
69     $extract_progress += -s $filename;
70 }
71
72 my $bw = Parallel::Fork::BossWorkerAsync->new(
73     work_handler   => \&work_alarmed,
74     global_timeout => 120,
75     worker_count   => 20,
76 );
77
78 foreach my $uri ( keys %uris ) {
79     my @filenames = @{ $uris{$uri} };
80     $bw->add_work( { uri => $uri, filenames => \@filenames } );
81 }
82
83 undef $extract_progress;
84
85 my $fetch_progress = Term::ProgressBar::Simple->new(
86     {   count => scalar( keys %uris ),
87         name  => 'Fetching URIs',
88     }
89 );
90
91 my %filenames;
92 while ( $bw->pending() ) {
93     my $response   = $bw->get_result();
94     my $uri        = $response->{uri};
95     my @filenames  = @{ $response->{filenames} };
96     my $is_success = $response->{is_success};
97     my $message    = $response->{message};
98
99     unless ($is_success) {
100         foreach my $filename (@filenames) {
101             push @{ $filenames{$filename} },
102                 { uri => $uri, message => $message };
103         }
104     }
105     $fetch_progress++;
106 }
107 $bw->shut_down();
108
109 my $fh = IO::File->new('> uris.txt');
110 foreach my $filename ( sort keys %filenames ) {
111     $fh->say("* $filename");
112     my @bits = @{ $filenames{$filename} };
113     foreach my $bit (@bits) {
114         my $uri     = $bit->{uri};
115         my $message = $bit->{message};
116         $fh->say("  $uri");
117         $fh->say("    $message");
118     }
119 }
120 $fh->close;
121
122 say 'Finished, see uris.txt';
123
124 sub work_alarmed {
125     my $conf = shift;
126     eval {
127         local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
128         alarm 60;
129         $conf = work($conf);
130         alarm 0;
131     };
132     if ($@) {
133         $conf->{is_success} = 0;
134         $conf->{message}    = 'Timed out';
135
136     }
137     return $conf;
138 }
139
140 sub work {
141     my $conf      = shift;
142     my $uri       = $conf->{uri};
143     my @filenames = @{ $conf->{filenames} };
144
145     if ( $uri =~ /^http/ ) {
146         my $uri_without_fragment = URI->new($uri);
147         my $fragment             = $uri_without_fragment->fragment(undef);
148         my $response             = $ua->head($uri_without_fragment);
149
150         $conf->{is_success} = $response->is_success;
151         $conf->{message}    = $response->status_line;
152         return $conf;
153     } else {
154
155         my $uri_object = URI->new($uri);
156         my $host       = $uri_object->host;
157         my $path       = $uri_object->path;
158         my ( $volume, $directories, $filename )
159             = File::Spec->splitpath($path);
160
161         my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
162         unless ($ftp) {
163             $conf->{is_succcess} = 0;
164             $conf->{message}     = "Can not connect to $host: $@";
165             return $conf;
166         }
167
168         my $can_login = $ftp->login( "anonymous", '-anonymous@' );
169         unless ($can_login) {
170             $conf->{is_success} = 0;
171             $conf->{message} = "Can not login ", $ftp->message;
172             return $conf;
173         }
174
175         my $can_binary = $ftp->binary();
176         unless ($can_binary) {
177             $conf->{is_success} = 0;
178             $conf->{message} = "Can not binary ", $ftp->message;
179             return $conf;
180         }
181
182         my $can_cwd = $ftp->cwd($directories);
183         unless ($can_cwd) {
184             $conf->{is_success} = 0;
185             $conf->{message} = "Can not cwd to $directories ", $ftp->message;
186             return $conf;
187         }
188
189         if ($filename) {
190             my $can_size = $ftp->size($filename);
191             unless ($can_size) {
192                 $conf->{is_success} = 0;
193                 $conf->{message}
194                     = "Can not size $filename in $directories",
195                     $ftp->message;
196                 return $conf;
197             }
198         } else {
199             my ($can_dir) = $ftp->dir;
200             unless ($can_dir) {
201                 my ($can_ls) = $ftp->ls;
202                 unless ($can_ls) {
203                     $conf->{is_success} = 0;
204                     $conf->{message}
205                         = "Can not dir or ls in $directories ",
206                         $ftp->message;
207                     return $conf;
208                 }
209             }
210         }
211
212         $conf->{is_success} = 1;
213         return $conf;
214     }
215 }
216
217 __DATA__
218 # these are fine but give errors
219 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
220 ftp://ftp.stratus.com/pub/vos/utility/utility.html
221
222 # these are missing, sigh
223 ftp://ftp.sco.com/SLS/ptf7051e.Z
224 http://perlmonks.thepen.com/42898.html
225 http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
226 http://public.activestate.com/cgi-bin/perlbrowse
227 http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
228 http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
229 http://my.smithmicro.com/mac/stuffit/
230 http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
231 http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
232
233 # these are URI extraction bugs
234 http://www.perl.org/E
235 http://en.wikipedia.org/wiki/SREC_(file_format
236 http://somewhere.else',-type=/
237 ftp:passive-mode
238 ftp:
239 http:[-
240 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
241 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
242 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
243 http://perl.come/
244 http://www.perl.come/
245
246 # these are used as an example
247 http://example.com/
248 http://something.here/
249 http://users.perl5.git.perl.org/~yourlogin/
250 http://github.com/USERNAME/perl/tree/orange
251 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
252 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
253 http://somewhere.else$/
254 http://somewhere.else$/
255 http://somewhere.else/bin/foo&bar',-Type=
256 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
257 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
258 http://www.perl.org/test.cgi
259 http://cpan2.local/
260 http://search.cpan.org/perldoc?
261 http://cpan1.local/
262 http://cpan.dev.local/CPAN
263 http:///
264 ftp://
265 ftp://myurl/
266 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
267 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
268 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
269 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
270 http://localhost/tmp/index.txt
271 http://example.com/foo/bar.html
272 http://example.com/Text-Bastardize-1.06.tar.gz
273 ftp://example.com/sources/packages.txt
274 http://example.com/sources/packages.txt
275 http://example.com/sources
276 ftp://example.com/sources
277 http://some.where.com/dir/file.txt
278 http://some.where.com/dir/a.txt
279 http://foo.com/X.tgz
280 ftp://foo.com/X.tgz
281 http://foo/
282 http://www.foo.com:8000/
283 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
284 http://decoded/mirror/path
285 http://a/b/c/d/e/f/g/h/i/j
286 http://foo/bar.gz
287 ftp://ftp.perl.org
288 http://purl.org/rss/1.0/modules/taxonomy/
289 ftp://ftp.sun.ac.za/CPAN/CPAN/
290 ftp://ftp.cpan.org/pub/mirror/index.txt
291 ftp://cpan.org/pub/mirror/index.txt
292 http://example.com/~eh/
293 http://plagger.org/.../rss
294 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
295 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
296 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
297 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
298 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
299 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip  
300 http://module-build.sourceforge.net/META-spec-new.html
301 http://module-build.sourceforge.net/META-spec-v1.4.html
302 http://www.cs.vu.nl/~tmgil/vi.html
303 http://perlcomposer.sourceforge.net/vperl.html
304 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
305 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
306 http://world.std.com/~aep/ptkdb/
307 http://www.castlelink.co.uk/object_system/
308 http://www.fh-wedel.de/elvis/
309 ftp://ftp.blarg.net/users/amol/zsh/
310 ftp://ftp.funet.fi/pub/languages/perl/CPAN
311 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
312 http://users.perl5.git.perl.org/~USERNAME
313 http://foo/x//y/script.cgi/a//b
314 http://xxx/script.cgi/http://foo
315 http://foo/./x//z/script.cgi/a/../b//c
316 http://somewhere.else/in/movie/land
317 http://somewhere.else/finished.html
318 http://somewhere.else/bin/foo&bar$
319 http://somewhere.else/
320 http://proxy:8484/
321 http://proxy/
322 http://myrepo.example.com/
323 http://remote/source
324 https://example.com/
325 http://example.com:1024/
326 http:///path?foo=bar
327 http://[::]:1024/
328 http://([/
329 http://example.com:9000/index.html
330 http://proxy.example.com:8080/
331 http:///index.html
332 http://[www.json::pp.org]/
333 http://localhost/
334 http://foo.example.com/
335 http://abc.com/a.js
336 http://whatever/man/1/crontab
337 http://abc.com/c.js
338 http://whatever/Foo%3A%3ABar
339 http://abc.com/b.js
340 http://remote.server.com/jquery.css
341 http://some.other.com/page.html
342 https://text.com/1/2
343 https://text.com/1/2
344 http://link.included.here?o=1&p=2
345 http://link.included.here?o=1&amp;p=2
346 http://link.included.here?o=1&amp;p=2
347 http://link.included.here/
348 http://foo/x//y/script.cgi/a//b
349 http://xxx/script.cgi/http://foo
350 http://foo/./x//z/script.cgi/a/../b//c
351 http://somewhere.else/in/movie/land
352 http://somewhere.else/finished.html
353 http://webproxy:3128/
354 http://www/
355
356 # these are used to generate or match URLs
357 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
358 http://www.cpantesters.org/show/%s.yaml
359 ftp://(.*?)/(.*)/(.*
360 ftp://(.*?)/(.*)/(.*
361 ftp://(.*?)/(.*)/(.*
362 ftp://ftp.foo.bar/
363 http://$host/
364 http://wwwe%3C46/
365 ftp:/
366 http://$addr/mark?commit=$
367 http://search.cpan.org/~
368 http:/
369 ftp:%5Cn$url
370 http://www.ietf.org/rfc/rfc$2.txt
371 http://search.cpan.org/~
372 ftp:%5Cn$url
373
374 # weird redirects that LWP doesn't like
375 http://www.theperlreview.com/community_calendar
376 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
377 http://sunsolve.sun.com
378
379 # broken webserver that doesn't like HEAD requests
380 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
381 http://www.w3.org/TR/html4/loose.dtd
382
383 # these have been reported upstream to CPAN authors
384 http://www.gnu.org/manual/tar/html_node/tar_139.html
385 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
386 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
387 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
388 http://search.cpan.org/search?query=Module::Build::Convert
389 http://www.refcnt.org/papers/module-build-convert
390 http://csrc.nist.gov/cryptval/shs.html
391 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
392 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
393 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
394 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
395 http://rfc.net/rfc2781.html
396 http://www.icu-project.org/charset/
397 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
398 http://www.rfc-editor.org/
399 http://www.rfc.net/
400 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
401 http://www.oreilly.com/catalog/cjkvinfo/
402 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
403 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
404 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
405 http://www.bsi-global.com/iso4217currency
406 http://www.plover.com/~mjd/perl/Memoize/
407 http://www.plover.com/~mjd/perl/MiniMemoize/
408 http://www.sysadminmag.com/tpj/issues/vol5_5/
409 ftp://ftp.tpc.int/tpc/server/UNIX/
410 http://www.nara.gov/genealogy/
411 http://home.utah-inter.net/kinsearch/Soundex.html
412 http://www.nara.gov/genealogy/soundex/soundex.html
413 http://rfc.net/rfc3461.html
414 ftp://ftp.cs.pdx.edu/pub/elvis/
415 http://www.fh-wedel.de/elvis/
416 http://lists.perl.org/list/perl-mvs.html
417 http://www.cpan.org/ports/os2/
418 http://github.com/dagolden/cpan-meta-spec
419 http://github.com/dagolden/cpan-meta-spec/issues
420 http://www.opensource.org/licenses/lgpl-license.phpt
421 http://reality.sgi.com/ariel
422 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
423 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
424 http://www.nsrl.nist.gov/testdata/
425 http://public.activestate.com/cgi-bin/perlbrowse/p/31194
426 http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
427 http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
428 http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
429 http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
430 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
431 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
432 http://github.com/schwern/extutils-makemaker
433 https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
434 http://www.json.org/JSON::PP_checker/
435 ftp://ftp.kiae.su/pub/unix/fido/
436 http://www.gallistel.net/nparker/weather/code/
437 http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
438 ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
439 http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
440 http://public.activestate.com/cgi-bin/perlbrowse/p/33567
441 http://public.activestate.com/cgi-bin/perlbrowse/p/33566
442 http://www.dsmit.com/cons/
443 http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
444
445 __END__
446
447 =head1 NAME
448
449 checkURL.pl - Check that all the URLs in the Perl source are valid
450
451 =head1 DESCRIPTION
452
453 This program checks that all the URLs in the Perl source are valid. It
454 checks HTTP and FTP links in parallel and contains a list of known
455 bad example links in its source. It takes 4 minutes to run on my
456 machine. The results are written to 'uris.txt' and list the filename,
457 the URL and the error:
458
459   * ext/Locale-Maketext/lib/Locale/Maketext.pod
460     http://sunsite.dk/RFC/rfc/rfc2277.html
461       404 Not Found
462   ...
463
464 It should be run every so often and links fixed and upstream authors
465 notified.
466
467 Note that the web is unstable and some websites are temporarily down.