6 require File::Find::Rule;
9 require IO::Socket::SSL;
10 use List::Util qw(sum);
11 require LWP::UserAgent;
13 require Parallel::Fork::BossWorkerAsync;
14 require Term::ProgressBar::Simple;
15 require URI::Find::Simple;
19 while ( my $line = <main::DATA> ) {
21 next if $line =~ /^#/;
26 my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
34 my $total_bytes = sum map {-s} @filenames;
36 my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
43 foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
55 # no need to hit rt.perl.org
57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
59 # no need to hit github
61 if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
63 # no need to hit rt.cpan.org
65 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
67 # no need to hit google groups (weird redirect LWP does not like)
69 if $uri =~ m{^http://groups\.google\.com/};
71 push @{ $uris{$uri} }, $filename;
73 $extract_progress += -s $filename;
76 my $bw = Parallel::Fork::BossWorkerAsync->new(
77 work_handler => \&work_alarmed,
78 global_timeout => 120,
82 foreach my $uri ( keys %uris ) {
83 my @filenames = @{ $uris{$uri} };
84 $bw->add_work( { uri => $uri, filenames => \@filenames } );
87 undef $extract_progress;
89 my $fetch_progress = Term::ProgressBar::Simple->new(
90 { count => scalar( keys %uris ),
91 name => 'Fetching URIs',
96 while ( $bw->pending() ) {
97 my $response = $bw->get_result();
98 my $uri = $response->{uri};
99 my @filenames = @{ $response->{filenames} };
100 my $is_success = $response->{is_success};
101 my $message = $response->{message};
103 unless ($is_success) {
104 foreach my $filename (@filenames) {
105 push @{ $filenames{$filename} },
106 { uri => $uri, message => $message };
113 my $fh = IO::File->new('> uris.txt');
114 foreach my $filename ( sort keys %filenames ) {
115 $fh->say("* $filename");
116 my @bits = @{ $filenames{$filename} };
117 foreach my $bit (@bits) {
118 my $uri = $bit->{uri};
119 my $message = $bit->{message};
121 $fh->say(" $message");
126 say 'Finished, see uris.txt';
131 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
137 $conf->{is_success} = 0;
138 $conf->{message} = 'Timed out';
146 my $uri = $conf->{uri};
147 my @filenames = @{ $conf->{filenames} };
149 if ( $uri =~ /^http/ ) {
150 my $uri_without_fragment = URI->new($uri);
151 my $fragment = $uri_without_fragment->fragment(undef);
152 my $response = $ua->head($uri_without_fragment);
154 $conf->{is_success} = $response->is_success;
155 $conf->{message} = $response->status_line;
159 my $uri_object = URI->new($uri);
160 my $host = $uri_object->host;
161 my $path = $uri_object->path;
162 my ( $volume, $directories, $filename )
163 = File::Spec->splitpath($path);
165 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
167 $conf->{is_succcess} = 0;
168 $conf->{message} = "Can not connect to $host: $@";
172 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173 unless ($can_login) {
174 $conf->{is_success} = 0;
175 $conf->{message} = "Can not login ", $ftp->message;
179 my $can_binary = $ftp->binary();
180 unless ($can_binary) {
181 $conf->{is_success} = 0;
182 $conf->{message} = "Can not binary ", $ftp->message;
186 my $can_cwd = $ftp->cwd($directories);
188 $conf->{is_success} = 0;
189 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
194 my $can_size = $ftp->size($filename);
196 $conf->{is_success} = 0;
198 = "Can not size $filename in $directories",
203 my ($can_dir) = $ftp->dir;
205 my ($can_ls) = $ftp->ls;
207 $conf->{is_success} = 0;
209 = "Can not dir or ls in $directories ",
216 $conf->{is_success} = 1;
222 # these are fine but give errors
223 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224 ftp://ftp.stratus.com/pub/vos/utility/utility.html
226 # these are missing, sigh
227 ftp://ftp.sco.com/SLS/ptf7051e.Z
228 http://perlmonks.thepen.com/42898.html
229 http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230 http://public.activestate.com/cgi-bin/perlbrowse
231 http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232 http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233 http://my.smithmicro.com/mac/stuffit/
234 http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235 http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
236 http://www.openzaurus.org/
238 http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239 http://www.pvhp.com/~pvhp/
240 http://www.pvhp.com/%7Epvhp/
241 http://www.pvhp.com/%7epvhp/
243 http://www.madgoat.com
244 http://www.mks.com/s390/gnu/
246 http://safaribooksonline.com/
247 http://use.perl.org/~autrijus/journal/25768
248 http://www.s390.ibm.com/products/oe/bpxqp1.html
249 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
250 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
251 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
252 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
253 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
254 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
255 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
256 http://www.w3.org/Security/Faq/
258 # these are URI extraction bugs
259 http://www.perl.org/E
260 http://en.wikipedia.org/wiki/SREC_(file_format
261 http://somewhere.else',-type=/
265 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
266 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
267 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
269 http://www.perl.come/
271 # these are used as an example
273 http://something.here/
274 http://users.perl5.git.perl.org/~yourlogin/
275 http://github.com/USERNAME/perl/tree/orange
276 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
277 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
278 http://somewhere.else$/
279 http://somewhere.else$/
280 http://somewhere.else/bin/foo&bar',-Type=
281 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
282 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
283 http://www.perl.org/test.cgi
285 http://search.cpan.org/perldoc?
287 http://cpan.dev.local/CPAN
291 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
292 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
293 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
294 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
295 http://localhost/tmp/index.txt
296 http://example.com/foo/bar.html
297 http://example.com/Text-Bastardize-1.06.tar.gz
298 ftp://example.com/sources/packages.txt
299 http://example.com/sources/packages.txt
300 http://example.com/sources
301 ftp://example.com/sources
302 http://some.where.com/dir/file.txt
303 http://some.where.com/dir/a.txt
307 http://www.foo.com:8000/
308 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
309 http://decoded/mirror/path
310 http://a/b/c/d/e/f/g/h/i/j
313 http://purl.org/rss/1.0/modules/taxonomy/
314 ftp://ftp.sun.ac.za/CPAN/CPAN/
315 ftp://ftp.cpan.org/pub/mirror/index.txt
316 ftp://cpan.org/pub/mirror/index.txt
317 http://example.com/~eh/
318 http://plagger.org/.../rss
319 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
320 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
321 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
322 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
323 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
324 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
325 http://module-build.sourceforge.net/META-spec-new.html
326 http://module-build.sourceforge.net/META-spec-v1.4.html
327 http://www.cs.vu.nl/~tmgil/vi.html
328 http://perlcomposer.sourceforge.net/vperl.html
329 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
330 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
331 http://world.std.com/~aep/ptkdb/
332 http://www.castlelink.co.uk/object_system/
333 http://www.fh-wedel.de/elvis/
334 ftp://ftp.blarg.net/users/amol/zsh/
335 ftp://ftp.funet.fi/pub/languages/perl/CPAN
336 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
337 http://users.perl5.git.perl.org/~USERNAME
338 http://foo/x//y/script.cgi/a//b
339 http://xxx/script.cgi/http://foo
340 http://foo/./x//z/script.cgi/a/../b//c
341 http://somewhere.else/in/movie/land
342 http://somewhere.else/finished.html
343 http://somewhere.else/bin/foo&bar$
344 http://somewhere.else/
347 http://myrepo.example.com/
350 http://example.com:1024/
354 http://example.com:9000/index.html
355 http://proxy.example.com:8080/
357 http://[www.json::pp.org]/
359 http://foo.example.com/
361 http://whatever/man/1/crontab
363 http://whatever/Foo%3A%3ABar
365 http://remote.server.com/jquery.css
366 http://some.other.com/page.html
369 http://link.included.here?o=1&p=2
370 http://link.included.here?o=1&p=2
371 http://link.included.here?o=1&p=2
372 http://link.included.here/
373 http://foo/x//y/script.cgi/a//b
374 http://xxx/script.cgi/http://foo
375 http://foo/./x//z/script.cgi/a/../b//c
376 http://somewhere.else/in/movie/land
377 http://somewhere.else/finished.html
378 http://webproxy:3128/
381 # these are used to generate or match URLs
382 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
383 http://www.cpantesters.org/show/%s.yaml
391 http://$addr/mark?commit=$
392 http://search.cpan.org/~
395 http://www.ietf.org/rfc/rfc$2.txt
396 http://search.cpan.org/~
399 # weird redirects that LWP doesn't like
400 http://www.theperlreview.com/community_calendar
401 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
402 http://sunsolve.sun.com
404 # broken webserver that doesn't like HEAD requests
405 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
406 http://www.w3.org/TR/html4/loose.dtd
408 # these have been reported upstream to CPAN authors
409 http://www.gnu.org/manual/tar/html_node/tar_139.html
410 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
411 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
412 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
413 http://search.cpan.org/search?query=Module::Build::Convert
414 http://www.refcnt.org/papers/module-build-convert
415 http://csrc.nist.gov/cryptval/shs.html
416 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
417 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
418 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
419 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
420 http://rfc.net/rfc2781.html
421 http://www.icu-project.org/charset/
422 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
423 http://www.rfc-editor.org/
425 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
426 http://www.oreilly.com/catalog/cjkvinfo/
427 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
428 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
429 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
430 http://www.bsi-global.com/iso4217currency
431 http://www.plover.com/~mjd/perl/Memoize/
432 http://www.plover.com/~mjd/perl/MiniMemoize/
433 http://www.sysadminmag.com/tpj/issues/vol5_5/
434 ftp://ftp.tpc.int/tpc/server/UNIX/
435 http://www.nara.gov/genealogy/
436 http://home.utah-inter.net/kinsearch/Soundex.html
437 http://www.nara.gov/genealogy/soundex/soundex.html
438 http://rfc.net/rfc3461.html
439 ftp://ftp.cs.pdx.edu/pub/elvis/
440 http://www.fh-wedel.de/elvis/
441 http://lists.perl.org/list/perl-mvs.html
442 http://www.cpan.org/ports/os2/
443 http://github.com/dagolden/cpan-meta-spec
444 http://github.com/dagolden/cpan-meta-spec/issues
445 http://www.opensource.org/licenses/lgpl-license.phpt
446 http://reality.sgi.com/ariel
447 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
448 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449 http://www.nsrl.nist.gov/testdata/
450 http://public.activestate.com/cgi-bin/perlbrowse/p/31194
451 http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
452 http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
453 http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
454 http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
455 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
456 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
457 http://github.com/schwern/extutils-makemaker
458 https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
459 http://www.json.org/JSON::PP_checker/
460 ftp://ftp.kiae.su/pub/unix/fido/
461 http://www.gallistel.net/nparker/weather/code/
462 http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
463 ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
464 http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
465 http://public.activestate.com/cgi-bin/perlbrowse/p/33567
466 http://public.activestate.com/cgi-bin/perlbrowse/p/33566
467 http://www.dsmit.com/cons/
468 http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
474 checkURL.pl - Check that all the URLs in the Perl source are valid
478 This program checks that all the URLs in the Perl source are valid. It
479 checks HTTP and FTP links in parallel and contains a list of known
480 bad example links in its source. It takes 4 minutes to run on my
481 machine. The results are written to 'uris.txt' and list the filename,
482 the URL and the error:
484 * ext/Locale-Maketext/lib/Locale/Maketext.pod
485 http://sunsite.dk/RFC/rfc/rfc2277.html
489 It should be run every so often and links fixed and upstream authors
492 Note that the web is unstable and some websites are temporarily down.