This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Bump version for 5.35.11
[perl5.git] / Porting / checkURL.pl
1 #!perl
2 use strict;
3 use warnings;
4 use autodie;
5 use feature qw(say);
6 require File::Find::Rule;
7 require File::Slurp;
8 require File::Spec;
9 require IO::Socket::SSL;
10 use List::Util qw(sum);
11 require LWP::UserAgent;
12 require Net::FTP;
13 require Parallel::Fork::BossWorkerAsync;
14 require Term::ProgressBar::Simple;
15 require URI::Find::Simple;
16 $| = 1;
17
18 my %ignore;
19 while ( my $line = <main::DATA> ) {
20     chomp $line;
21     next if $line =~ /^#/;
22     next unless $line;
23     $ignore{$line} = 1;
24 }
25
26 my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27 $ua->timeout(58);
28 $ua->env_proxy;
29
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32     unless @filenames;
33
34 my $total_bytes = sum map {-s} @filenames;
35
36 my $extract_progress = Term::ProgressBar::Simple->new(
37     {   count => $total_bytes,
38         name  => 'Extracting URIs',
39     }
40 );
41
42 my %uris;
43 foreach my $filename (@filenames) {
44     next if $filename =~ /uris\.txt/;
45     next if $filename =~ /check_uris/;
46     next if $filename =~ /\.patch$/;
47     next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48     next if $filename =~ /checkURL\.pl$/;
49     my $contents = File::Slurp::read_file($filename);
50     my @uris     = URI::Find::Simple::list_uris($contents);
51     foreach my $uri (@uris) {
52         next unless $uri =~ /^(http|ftp)/;
53         next if $ignore{$uri};
54
55         # no need to hit rt.perl.org
56         next
57             if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59         # no need to hit github
60         next
61             if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
62
63         # no need to hit rt.cpan.org
64         next
65             if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
66
67         # no need to hit google groups (weird redirect LWP does not like)
68         next
69             if $uri =~ m{^http://groups\.google\.com/};
70
71         push @{ $uris{$uri} }, $filename;
72     }
73     $extract_progress += -s $filename;
74 }
75
76 my $bw = Parallel::Fork::BossWorkerAsync->new(
77     work_handler   => \&work_alarmed,
78     global_timeout => 120,
79     worker_count   => 20,
80 );
81
82 foreach my $uri ( keys %uris ) {
83     my @filenames = @{ $uris{$uri} };
84     $bw->add_work( { uri => $uri, filenames => \@filenames } );
85 }
86
87 undef $extract_progress;
88
89 my $fetch_progress = Term::ProgressBar::Simple->new(
90     {   count => scalar( keys %uris ),
91         name  => 'Fetching URIs',
92     }
93 );
94
95 my %filenames;
96 while ( $bw->pending() ) {
97     my $response   = $bw->get_result();
98     my $uri        = $response->{uri};
99     my @filenames  = @{ $response->{filenames} };
100     my $is_success = $response->{is_success};
101     my $message    = $response->{message};
102
103     unless ($is_success) {
104         foreach my $filename (@filenames) {
105             push @{ $filenames{$filename} },
106                 { uri => $uri, message => $message };
107         }
108     }
109     $fetch_progress++;
110 }
111 $bw->shut_down();
112
113 my $fh = IO::File->new('> uris.txt');
114 foreach my $filename ( sort keys %filenames ) {
115     $fh->say("* $filename");
116     my @bits = @{ $filenames{$filename} };
117     foreach my $bit (@bits) {
118         my $uri     = $bit->{uri};
119         my $message = $bit->{message};
120         $fh->say("  $uri");
121         $fh->say("    $message");
122     }
123 }
124 $fh->close;
125
126 say 'Finished, see uris.txt';
127
128 sub work_alarmed {
129     my $conf = shift;
130     eval {
131         local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
132         alarm 60;
133         $conf = work($conf);
134         alarm 0;
135     };
136     if ($@) {
137         $conf->{is_success} = 0;
138         $conf->{message}    = 'Timed out';
139
140     }
141     return $conf;
142 }
143
144 sub work {
145     my $conf      = shift;
146     my $uri       = $conf->{uri};
147     my @filenames = @{ $conf->{filenames} };
148
149     if ( $uri =~ /^http/ ) {
150         my $uri_without_fragment = URI->new($uri);
151         my $fragment             = $uri_without_fragment->fragment(undef);
152         my $response             = $ua->head($uri_without_fragment);
153
154         $conf->{is_success} = $response->is_success;
155         $conf->{message}    = $response->status_line;
156         return $conf;
157     } else {
158
159         my $uri_object = URI->new($uri);
160         my $host       = $uri_object->host;
161         my $path       = $uri_object->path;
162         my ( $volume, $directories, $filename )
163             = File::Spec->splitpath($path);
164
165         my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
166         unless ($ftp) {
167             $conf->{is_succcess} = 0;
168             $conf->{message}     = "Can not connect to $host: $@";
169             return $conf;
170         }
171
172         my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173         unless ($can_login) {
174             $conf->{is_success} = 0;
175             $conf->{message} = "Can not login ", $ftp->message;
176             return $conf;
177         }
178
179         my $can_binary = $ftp->binary();
180         unless ($can_binary) {
181             $conf->{is_success} = 0;
182             $conf->{message} = "Can not binary ", $ftp->message;
183             return $conf;
184         }
185
186         my $can_cwd = $ftp->cwd($directories);
187         unless ($can_cwd) {
188             $conf->{is_success} = 0;
189             $conf->{message} = "Can not cwd to $directories ", $ftp->message;
190             return $conf;
191         }
192
193         if ($filename) {
194             my $can_size = $ftp->size($filename);
195             unless ($can_size) {
196                 $conf->{is_success} = 0;
197                 $conf->{message}
198                     = "Can not size $filename in $directories",
199                     $ftp->message;
200                 return $conf;
201             }
202         } else {
203             my ($can_dir) = $ftp->dir;
204             unless ($can_dir) {
205                 my ($can_ls) = $ftp->ls;
206                 unless ($can_ls) {
207                     $conf->{is_success} = 0;
208                     $conf->{message}
209                         = "Can not dir or ls in $directories ",
210                         $ftp->message;
211                     return $conf;
212                 }
213             }
214         }
215
216         $conf->{is_success} = 1;
217         return $conf;
218     }
219 }
220
221 __DATA__
222 # these are fine but give errors
223 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224 ftp://ftp.stratus.com/pub/vos/utility/utility.html
225
226 # these are missing, sigh
227 ftp://ftp.sco.com/SLS/ptf7051e.Z
228 http://perlmonks.thepen.com/42898.html
229 http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230 http://public.activestate.com/cgi-bin/perlbrowse
231 http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232 http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233 http://my.smithmicro.com/mac/stuffit/
234 http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235 http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
236 http://www.openzaurus.org/
237 http://Casbah.org/
238 http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239 http://www.pvhp.com/~pvhp/
240 http://www.pvhp.com/%7Epvhp/
241 http://www.pvhp.com/%7epvhp/
242 http://www.leo.org
243 http://www.madgoat.com
244 http://www.mks.com/s390/gnu/
245 http://www.tpj.com/
246 http://safaribooksonline.com/
247 http://use.perl.org/~autrijus/journal/25768
248 http://www.s390.ibm.com/products/oe/bpxqp1.html
249 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
250 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
251 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
252 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
253 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
254 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
255 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
256 http://www.w3.org/Security/Faq/
257
258 # these are URI extraction bugs
259 http://www.perl.org/E
260 http://en.wikipedia.org/wiki/SREC_(file_format
261 http://somewhere.else',-type=/
262 ftp:passive-mode
263 ftp:
264 http:[-
265 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
266 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
267 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
268 http://perl.come/
269 http://www.perl.come/
270
271 # these are used as an example
272 http://example.com/
273 http://something.here/
274 http://users.perl5.git.perl.org/~yourlogin/
275 http://github.com/USERNAME/perl/tree/orange
276 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
277 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
278 http://somewhere.else$/
279 http://somewhere.else$/
280 http://somewhere.else/bin/foo&bar',-Type=
281 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
282 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
283 http://www.perl.org/test.cgi
284 http://cpan2.local/
285 http://search.cpan.org/perldoc?
286 http://cpan1.local/
287 http://cpan.dev.local/CPAN
288 http:///
289 ftp://
290 ftp://myurl/
291 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
292 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
293 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
294 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
295 http://localhost/tmp/index.txt
296 http://example.com/foo/bar.html
297 http://example.com/Text-Bastardize-1.06.tar.gz
298 ftp://example.com/sources/packages.txt
299 http://example.com/sources/packages.txt
300 http://example.com/sources
301 ftp://example.com/sources
302 http://some.where.com/dir/file.txt
303 http://some.where.com/dir/a.txt
304 http://foo.com/X.tgz
305 ftp://foo.com/X.tgz
306 http://foo/
307 http://www.foo.com:8000/
308 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
309 http://decoded/mirror/path
310 http://a/b/c/d/e/f/g/h/i/j
311 http://foo/bar.gz
312 ftp://ftp.perl.org
313 http://purl.org/rss/1.0/modules/taxonomy/
314 ftp://ftp.sun.ac.za/CPAN/CPAN/
315 ftp://ftp.cpan.org/pub/mirror/index.txt
316 ftp://cpan.org/pub/mirror/index.txt
317 http://example.com/~eh/
318 http://plagger.org/.../rss
319 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
320 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
321 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
322 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
323 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
324 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip  
325 http://module-build.sourceforge.net/META-spec-new.html
326 http://module-build.sourceforge.net/META-spec-v1.4.html
327 http://www.cs.vu.nl/~tmgil/vi.html
328 http://perlcomposer.sourceforge.net/vperl.html
329 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
330 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
331 http://world.std.com/~aep/ptkdb/
332 http://www.castlelink.co.uk/object_system/
333 http://www.fh-wedel.de/elvis/
334 ftp://ftp.blarg.net/users/amol/zsh/
335 ftp://ftp.funet.fi/pub/languages/perl/CPAN
336 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
337 http://users.perl5.git.perl.org/~USERNAME
338 http://foo/x//y/script.cgi/a//b
339 http://xxx/script.cgi/http://foo
340 http://foo/./x//z/script.cgi/a/../b//c
341 http://somewhere.else/in/movie/land
342 http://somewhere.else/finished.html
343 http://somewhere.else/bin/foo&bar$
344 http://somewhere.else/
345 http://proxy:8484/
346 http://proxy/
347 http://myrepo.example.com/
348 http://remote/source
349 https://example.com/
350 http://example.com:1024/
351 http:///path?foo=bar
352 http://[::]:1024/
353 http://([/
354 http://example.com:9000/index.html
355 http://proxy.example.com:8080/
356 http:///index.html
357 http://[www.json::pp.org]/
358 http://localhost/
359 http://foo.example.com/
360 http://abc.com/a.js
361 http://whatever/man/1/crontab
362 http://abc.com/c.js
363 http://whatever/Foo%3A%3ABar
364 http://abc.com/b.js
365 http://remote.server.com/jquery.css
366 http://some.other.com/page.html
367 https://text.com/1/2
368 https://text.com/1/2
369 http://link.included.here?o=1&p=2
370 http://link.included.here?o=1&amp;p=2
371 http://link.included.here?o=1&amp;p=2
372 http://link.included.here/
373 http://foo/x//y/script.cgi/a//b
374 http://xxx/script.cgi/http://foo
375 http://foo/./x//z/script.cgi/a/../b//c
376 http://somewhere.else/in/movie/land
377 http://somewhere.else/finished.html
378 http://webproxy:3128/
379 http://www/
380
381 # these are used to generate or match URLs
382 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
383 http://www.cpantesters.org/show/%s.yaml
384 ftp://(.*?)/(.*)/(.*
385 ftp://(.*?)/(.*)/(.*
386 ftp://(.*?)/(.*)/(.*
387 ftp://ftp.foo.bar/
388 http://$host/
389 http://wwwe%3C46/
390 ftp:/
391 http://$addr/mark?commit=$
392 http://search.cpan.org/~
393 http:/
394 ftp:%5Cn$url
395 http://www.ietf.org/rfc/rfc$2.txt
396 http://search.cpan.org/~
397 ftp:%5Cn$url
398
399 # weird redirects that LWP doesn't like
400 http://www.theperlreview.com/community_calendar
401 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
402 http://sunsolve.sun.com
403
404 # broken webserver that doesn't like HEAD requests
405 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
406 http://www.w3.org/TR/html4/loose.dtd
407
408 # these have been reported upstream to CPAN authors
409 http://www.gnu.org/manual/tar/html_node/tar_139.html
410 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
411 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
412 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
413 http://search.cpan.org/search?query=Module::Build::Convert
414 http://www.refcnt.org/papers/module-build-convert
415 http://csrc.nist.gov/cryptval/shs.html
416 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
417 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
418 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
419 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
420 http://rfc.net/rfc2781.html
421 http://www.icu-project.org/charset/
422 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
423 http://www.rfc-editor.org/
424 http://www.rfc.net/
425 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
426 http://www.oreilly.com/catalog/cjkvinfo/
427 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
428 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
429 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
430 http://www.bsi-global.com/iso4217currency
431 http://www.plover.com/~mjd/perl/Memoize/
432 http://www.plover.com/~mjd/perl/MiniMemoize/
433 http://www.sysadminmag.com/tpj/issues/vol5_5/
434 ftp://ftp.tpc.int/tpc/server/UNIX/
435 http://www.nara.gov/genealogy/
436 http://home.utah-inter.net/kinsearch/Soundex.html
437 http://www.nara.gov/genealogy/soundex/soundex.html
438 http://rfc.net/rfc3461.html
439 ftp://ftp.cs.pdx.edu/pub/elvis/
440 http://www.fh-wedel.de/elvis/
441 http://lists.perl.org/list/perl-mvs.html
442 http://www.cpan.org/ports/os2/
443 http://github.com/dagolden/cpan-meta-spec
444 http://github.com/dagolden/cpan-meta-spec/issues
445 http://www.opensource.org/licenses/lgpl-license.phpt
446 http://reality.sgi.com/ariel
447 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
448 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449 http://www.nsrl.nist.gov/testdata/
450 http://public.activestate.com/cgi-bin/perlbrowse/p/31194
451 http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
452 http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
453 http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
454 http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
455 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
456 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
457 http://github.com/schwern/extutils-makemaker
458 https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
459 http://www.json.org/JSON::PP_checker/
460 ftp://ftp.kiae.su/pub/unix/fido/
461 http://www.gallistel.net/nparker/weather/code/
462 http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
463 ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
464 http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
465 http://public.activestate.com/cgi-bin/perlbrowse/p/33567
466 http://public.activestate.com/cgi-bin/perlbrowse/p/33566
467 http://www.dsmit.com/cons/
468 http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
469
470 __END__
471
472 =head1 NAME
473
474 checkURL.pl - Check that all the URLs in the Perl source are valid
475
476 =head1 DESCRIPTION
477
478 This program checks that all the URLs in the Perl source are valid. It
479 checks HTTP and FTP links in parallel and contains a list of known
480 bad example links in its source. It takes 4 minutes to run on my
481 machine. The results are written to 'uris.txt' and list the filename,
482 the URL and the error:
483
484   * ext/Locale-Maketext/lib/Locale/Maketext.pod
485     http://sunsite.dk/RFC/rfc/rfc2277.html
486       404 Not Found
487   ...
488
489 It should be run every so often and links fixed and upstream authors
490 notified.
491
492 Note that the web is unstable and some websites are temporarily down.