This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Bump version to 5.33.5
[perl5.git] / Porting / checkURL.pl
1 #!perl
2 use strict;
3 use warnings;
4 use autodie;
5 use feature qw(say);
6 require File::Find::Rule;
7 require File::Slurp;
8 require File::Spec;
9 require IO::Socket::SSL;
10 use List::Util qw(sum);
11 require LWP::UserAgent;
12 require Net::FTP;
13 require Parallel::Fork::BossWorkerAsync;
14 require Term::ProgressBar::Simple;
15 require URI::Find::Simple;
16 $| = 1;
17
18 my %ignore;
19 while ( my $line = <main::DATA> ) {
20     chomp $line;
21     next if $line =~ /^#/;
22     next unless $line;
23     $ignore{$line} = 1;
24 }
25
26 my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27 $ua->timeout(58);
28 $ua->env_proxy;
29
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32     unless @filenames;
33
34 my $total_bytes = sum map {-s} @filenames;
35
36 my $extract_progress = Term::ProgressBar::Simple->new(
37     {   count => $total_bytes,
38         name  => 'Extracting URIs',
39     }
40 );
41
42 my %uris;
43 foreach my $filename (@filenames) {
44     next if $filename =~ /uris\.txt/;
45     next if $filename =~ /check_uris/;
46     next if $filename =~ /\.patch$/;
47     next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48     next if $filename =~ /checkURL\.pl$/;
49     my $contents = File::Slurp::read_file($filename);
50     my @uris     = URI::Find::Simple::list_uris($contents);
51     foreach my $uri (@uris) {
52         next unless $uri =~ /^(http|ftp)/;
53         next if $ignore{$uri};
54
55         # no need to hit rt.perl.org
56         next
57             if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59         # no need to hit github
60         next
61             if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
62
63         # no need to hit rt.cpan.org
64         next
65             if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
66
67         # no need to hit google groups (weird redirect LWP does not like)
68         next
69             if $uri =~ m{^http://groups\.google\.com/};
70
71         push @{ $uris{$uri} }, $filename;
72     }
73     $extract_progress += -s $filename;
74 }
75
76 my $bw = Parallel::Fork::BossWorkerAsync->new(
77     work_handler   => \&work_alarmed,
78     global_timeout => 120,
79     worker_count   => 20,
80 );
81
82 foreach my $uri ( keys %uris ) {
83     my @filenames = @{ $uris{$uri} };
84     $bw->add_work( { uri => $uri, filenames => \@filenames } );
85 }
86
87 undef $extract_progress;
88
89 my $fetch_progress = Term::ProgressBar::Simple->new(
90     {   count => scalar( keys %uris ),
91         name  => 'Fetching URIs',
92     }
93 );
94
95 my %filenames;
96 while ( $bw->pending() ) {
97     my $response   = $bw->get_result();
98     my $uri        = $response->{uri};
99     my @filenames  = @{ $response->{filenames} };
100     my $is_success = $response->{is_success};
101     my $message    = $response->{message};
102
103     unless ($is_success) {
104         foreach my $filename (@filenames) {
105             push @{ $filenames{$filename} },
106                 { uri => $uri, message => $message };
107         }
108     }
109     $fetch_progress++;
110 }
111 $bw->shut_down();
112
113 my $fh = IO::File->new('> uris.txt');
114 foreach my $filename ( sort keys %filenames ) {
115     $fh->say("* $filename");
116     my @bits = @{ $filenames{$filename} };
117     foreach my $bit (@bits) {
118         my $uri     = $bit->{uri};
119         my $message = $bit->{message};
120         $fh->say("  $uri");
121         $fh->say("    $message");
122     }
123 }
124 $fh->close;
125
126 say 'Finished, see uris.txt';
127
128 sub work_alarmed {
129     my $conf = shift;
130     eval {
131         local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
132         alarm 60;
133         $conf = work($conf);
134         alarm 0;
135     };
136     if ($@) {
137         $conf->{is_success} = 0;
138         $conf->{message}    = 'Timed out';
139
140     }
141     return $conf;
142 }
143
144 sub work {
145     my $conf      = shift;
146     my $uri       = $conf->{uri};
147     my @filenames = @{ $conf->{filenames} };
148
149     if ( $uri =~ /^http/ ) {
150         my $uri_without_fragment = URI->new($uri);
151         my $fragment             = $uri_without_fragment->fragment(undef);
152         my $response             = $ua->head($uri_without_fragment);
153
154         $conf->{is_success} = $response->is_success;
155         $conf->{message}    = $response->status_line;
156         return $conf;
157     } else {
158
159         my $uri_object = URI->new($uri);
160         my $host       = $uri_object->host;
161         my $path       = $uri_object->path;
162         my ( $volume, $directories, $filename )
163             = File::Spec->splitpath($path);
164
165         my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
166         unless ($ftp) {
167             $conf->{is_succcess} = 0;
168             $conf->{message}     = "Can not connect to $host: $@";
169             return $conf;
170         }
171
172         my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173         unless ($can_login) {
174             $conf->{is_success} = 0;
175             $conf->{message} = "Can not login ", $ftp->message;
176             return $conf;
177         }
178
179         my $can_binary = $ftp->binary();
180         unless ($can_binary) {
181             $conf->{is_success} = 0;
182             $conf->{message} = "Can not binary ", $ftp->message;
183             return $conf;
184         }
185
186         my $can_cwd = $ftp->cwd($directories);
187         unless ($can_cwd) {
188             $conf->{is_success} = 0;
189             $conf->{message} = "Can not cwd to $directories ", $ftp->message;
190             return $conf;
191         }
192
193         if ($filename) {
194             my $can_size = $ftp->size($filename);
195             unless ($can_size) {
196                 $conf->{is_success} = 0;
197                 $conf->{message}
198                     = "Can not size $filename in $directories",
199                     $ftp->message;
200                 return $conf;
201             }
202         } else {
203             my ($can_dir) = $ftp->dir;
204             unless ($can_dir) {
205                 my ($can_ls) = $ftp->ls;
206                 unless ($can_ls) {
207                     $conf->{is_success} = 0;
208                     $conf->{message}
209                         = "Can not dir or ls in $directories ",
210                         $ftp->message;
211                     return $conf;
212                 }
213             }
214         }
215
216         $conf->{is_success} = 1;
217         return $conf;
218     }
219 }
220
221 __DATA__
222 # these are fine but give errors
223 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224 ftp://ftp.stratus.com/pub/vos/utility/utility.html
225
226 # these are missing, sigh
227 ftp://ftp.sco.com/SLS/ptf7051e.Z
228 http://perlmonks.thepen.com/42898.html
229 http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230 http://public.activestate.com/cgi-bin/perlbrowse
231 http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232 http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233 http://my.smithmicro.com/mac/stuffit/
234 http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235 http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
236 http://www.openzaurus.org/
237 http://Casbah.org/
238 http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239 http://www.pvhp.com/~pvhp/
240 http://www.pvhp.com/%7Epvhp/
241 http://www.pvhp.com/%7epvhp/
242 http://www.leo.org
243 http://www.madgoat.com
244 http://www.mks.com/s390/gnu/
245 http://www.research.att.com/sw/tools/uwin/
246 http://www.tpj.com/
247 http://safaribooksonline.com/
248 http://use.perl.org/~autrijus/journal/25768
249 http://www.s390.ibm.com/products/oe/bpxqp1.html
250 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
251 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
252 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
253 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
254 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
255 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
256 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
257 http://www.w3.org/Security/Faq/
258
259 # these are URI extraction bugs
260 http://www.perl.org/E
261 http://en.wikipedia.org/wiki/SREC_(file_format
262 http://somewhere.else',-type=/
263 ftp:passive-mode
264 ftp:
265 http:[-
266 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
267 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
268 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
269 http://perl.come/
270 http://www.perl.come/
271
272 # these are used as an example
273 http://example.com/
274 http://something.here/
275 http://users.perl5.git.perl.org/~yourlogin/
276 http://github.com/USERNAME/perl/tree/orange
277 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
278 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
279 http://somewhere.else$/
280 http://somewhere.else$/
281 http://somewhere.else/bin/foo&bar',-Type=
282 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
283 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
284 http://www.perl.org/test.cgi
285 http://cpan2.local/
286 http://search.cpan.org/perldoc?
287 http://cpan1.local/
288 http://cpan.dev.local/CPAN
289 http:///
290 ftp://
291 ftp://myurl/
292 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
293 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
294 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
295 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
296 http://localhost/tmp/index.txt
297 http://example.com/foo/bar.html
298 http://example.com/Text-Bastardize-1.06.tar.gz
299 ftp://example.com/sources/packages.txt
300 http://example.com/sources/packages.txt
301 http://example.com/sources
302 ftp://example.com/sources
303 http://some.where.com/dir/file.txt
304 http://some.where.com/dir/a.txt
305 http://foo.com/X.tgz
306 ftp://foo.com/X.tgz
307 http://foo/
308 http://www.foo.com:8000/
309 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
310 http://decoded/mirror/path
311 http://a/b/c/d/e/f/g/h/i/j
312 http://foo/bar.gz
313 ftp://ftp.perl.org
314 http://purl.org/rss/1.0/modules/taxonomy/
315 ftp://ftp.sun.ac.za/CPAN/CPAN/
316 ftp://ftp.cpan.org/pub/mirror/index.txt
317 ftp://cpan.org/pub/mirror/index.txt
318 http://example.com/~eh/
319 http://plagger.org/.../rss
320 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
321 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
322 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
323 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
324 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
325 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip  
326 http://module-build.sourceforge.net/META-spec-new.html
327 http://module-build.sourceforge.net/META-spec-v1.4.html
328 http://www.cs.vu.nl/~tmgil/vi.html
329 http://perlcomposer.sourceforge.net/vperl.html
330 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
331 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
332 http://world.std.com/~aep/ptkdb/
333 http://www.castlelink.co.uk/object_system/
334 http://www.fh-wedel.de/elvis/
335 ftp://ftp.blarg.net/users/amol/zsh/
336 ftp://ftp.funet.fi/pub/languages/perl/CPAN
337 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
338 http://users.perl5.git.perl.org/~USERNAME
339 http://foo/x//y/script.cgi/a//b
340 http://xxx/script.cgi/http://foo
341 http://foo/./x//z/script.cgi/a/../b//c
342 http://somewhere.else/in/movie/land
343 http://somewhere.else/finished.html
344 http://somewhere.else/bin/foo&bar$
345 http://somewhere.else/
346 http://proxy:8484/
347 http://proxy/
348 http://myrepo.example.com/
349 http://remote/source
350 https://example.com/
351 http://example.com:1024/
352 http:///path?foo=bar
353 http://[::]:1024/
354 http://([/
355 http://example.com:9000/index.html
356 http://proxy.example.com:8080/
357 http:///index.html
358 http://[www.json::pp.org]/
359 http://localhost/
360 http://foo.example.com/
361 http://abc.com/a.js
362 http://whatever/man/1/crontab
363 http://abc.com/c.js
364 http://whatever/Foo%3A%3ABar
365 http://abc.com/b.js
366 http://remote.server.com/jquery.css
367 http://some.other.com/page.html
368 https://text.com/1/2
369 https://text.com/1/2
370 http://link.included.here?o=1&p=2
371 http://link.included.here?o=1&amp;p=2
372 http://link.included.here?o=1&amp;p=2
373 http://link.included.here/
374 http://foo/x//y/script.cgi/a//b
375 http://xxx/script.cgi/http://foo
376 http://foo/./x//z/script.cgi/a/../b//c
377 http://somewhere.else/in/movie/land
378 http://somewhere.else/finished.html
379 http://webproxy:3128/
380 http://www/
381
382 # these are used to generate or match URLs
383 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
384 http://www.cpantesters.org/show/%s.yaml
385 ftp://(.*?)/(.*)/(.*
386 ftp://(.*?)/(.*)/(.*
387 ftp://(.*?)/(.*)/(.*
388 ftp://ftp.foo.bar/
389 http://$host/
390 http://wwwe%3C46/
391 ftp:/
392 http://$addr/mark?commit=$
393 http://search.cpan.org/~
394 http:/
395 ftp:%5Cn$url
396 http://www.ietf.org/rfc/rfc$2.txt
397 http://search.cpan.org/~
398 ftp:%5Cn$url
399
400 # weird redirects that LWP doesn't like
401 http://www.theperlreview.com/community_calendar
402 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
403 http://sunsolve.sun.com
404
405 # broken webserver that doesn't like HEAD requests
406 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
407 http://www.w3.org/TR/html4/loose.dtd
408
409 # these have been reported upstream to CPAN authors
410 http://www.gnu.org/manual/tar/html_node/tar_139.html
411 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
412 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
413 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
414 http://search.cpan.org/search?query=Module::Build::Convert
415 http://www.refcnt.org/papers/module-build-convert
416 http://csrc.nist.gov/cryptval/shs.html
417 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
418 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
419 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
420 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
421 http://rfc.net/rfc2781.html
422 http://www.icu-project.org/charset/
423 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
424 http://www.rfc-editor.org/
425 http://www.rfc.net/
426 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
427 http://www.oreilly.com/catalog/cjkvinfo/
428 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
429 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
430 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
431 http://www.bsi-global.com/iso4217currency
432 http://www.plover.com/~mjd/perl/Memoize/
433 http://www.plover.com/~mjd/perl/MiniMemoize/
434 http://www.sysadminmag.com/tpj/issues/vol5_5/
435 ftp://ftp.tpc.int/tpc/server/UNIX/
436 http://www.nara.gov/genealogy/
437 http://home.utah-inter.net/kinsearch/Soundex.html
438 http://www.nara.gov/genealogy/soundex/soundex.html
439 http://rfc.net/rfc3461.html
440 ftp://ftp.cs.pdx.edu/pub/elvis/
441 http://www.fh-wedel.de/elvis/
442 http://lists.perl.org/list/perl-mvs.html
443 http://www.cpan.org/ports/os2/
444 http://github.com/dagolden/cpan-meta-spec
445 http://github.com/dagolden/cpan-meta-spec/issues
446 http://www.opensource.org/licenses/lgpl-license.phpt
447 http://reality.sgi.com/ariel
448 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
450 http://www.nsrl.nist.gov/testdata/
451 http://public.activestate.com/cgi-bin/perlbrowse/p/31194
452 http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
453 http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
454 http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
455 http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
456 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
457 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
458 http://github.com/schwern/extutils-makemaker
459 https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
460 http://www.json.org/JSON::PP_checker/
461 ftp://ftp.kiae.su/pub/unix/fido/
462 http://www.gallistel.net/nparker/weather/code/
463 http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
464 ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
465 http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
466 http://public.activestate.com/cgi-bin/perlbrowse/p/33567
467 http://public.activestate.com/cgi-bin/perlbrowse/p/33566
468 http://www.dsmit.com/cons/
469 http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
470
471 __END__
472
473 =head1 NAME
474
475 checkURL.pl - Check that all the URLs in the Perl source are valid
476
477 =head1 DESCRIPTION
478
479 This program checks that all the URLs in the Perl source are valid. It
480 checks HTTP and FTP links in parallel and contains a list of known
481 bad example links in its source. It takes 4 minutes to run on my
482 machine. The results are written to 'uris.txt' and list the filename,
483 the URL and the error:
484
485   * ext/Locale-Maketext/lib/Locale/Maketext.pod
486     http://sunsite.dk/RFC/rfc/rfc2277.html
487       404 Not Found
488   ...
489
490 It should be run every so often and links fixed and upstream authors
491 notified.
492
493 Note that the web is unstable and some websites are temporarily down.