6 require File::Find::Rule;
9 require IO::Socket::SSL;
10 use List::Util qw(sum);
11 require LWP::UserAgent;
13 require Parallel::Fork::BossWorkerAsync;
14 require Term::ProgressBar::Simple;
15 require URI::Find::Simple;
19 while ( my $line = <main::DATA> ) {
21 next if $line =~ /^#/;
26 my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
30 my @filenames = @ARGV;
31 @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
34 my $total_bytes = sum map {-s} @filenames;
36 my $extract_progress = Term::ProgressBar::Simple->new(
37 { count => $total_bytes,
38 name => 'Extracting URIs',
43 foreach my $filename (@filenames) {
44 next if $filename =~ /uris\.txt/;
45 next if $filename =~ /check_uris/;
46 next if $filename =~ /\.patch$/;
47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48 next if $filename =~ /checkURL\.pl$/;
49 my $contents = File::Slurp::read_file($filename);
50 my @uris = URI::Find::Simple::list_uris($contents);
51 foreach my $uri (@uris) {
52 next unless $uri =~ /^(http|ftp)/;
53 next if $ignore{$uri};
55 # no need to hit rt.perl.org
57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
59 # no need to hit github
61 if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
63 # no need to hit rt.cpan.org
65 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
67 # no need to hit google groups (weird redirect LWP does not like)
69 if $uri =~ m{^http://groups\.google\.com/};
71 push @{ $uris{$uri} }, $filename;
73 $extract_progress += -s $filename;
76 my $bw = Parallel::Fork::BossWorkerAsync->new(
77 work_handler => \&work_alarmed,
78 global_timeout => 120,
82 foreach my $uri ( keys %uris ) {
83 my @filenames = @{ $uris{$uri} };
84 $bw->add_work( { uri => $uri, filenames => \@filenames } );
87 undef $extract_progress;
89 my $fetch_progress = Term::ProgressBar::Simple->new(
90 { count => scalar( keys %uris ),
91 name => 'Fetching URIs',
96 while ( $bw->pending() ) {
97 my $response = $bw->get_result();
98 my $uri = $response->{uri};
99 my @filenames = @{ $response->{filenames} };
100 my $is_success = $response->{is_success};
101 my $message = $response->{message};
103 unless ($is_success) {
104 foreach my $filename (@filenames) {
105 push @{ $filenames{$filename} },
106 { uri => $uri, message => $message };
113 my $fh = IO::File->new('> uris.txt');
114 foreach my $filename ( sort keys %filenames ) {
115 $fh->say("* $filename");
116 my @bits = @{ $filenames{$filename} };
117 foreach my $bit (@bits) {
118 my $uri = $bit->{uri};
119 my $message = $bit->{message};
121 $fh->say(" $message");
126 say 'Finished, see uris.txt';
131 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required
137 $conf->{is_success} = 0;
138 $conf->{message} = 'Timed out';
146 my $uri = $conf->{uri};
147 my @filenames = @{ $conf->{filenames} };
149 if ( $uri =~ /^http/ ) {
150 my $uri_without_fragment = URI->new($uri);
151 my $fragment = $uri_without_fragment->fragment(undef);
152 my $response = $ua->head($uri_without_fragment);
154 $conf->{is_success} = $response->is_success;
155 $conf->{message} = $response->status_line;
159 my $uri_object = URI->new($uri);
160 my $host = $uri_object->host;
161 my $path = $uri_object->path;
162 my ( $volume, $directories, $filename )
163 = File::Spec->splitpath($path);
165 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
167 $conf->{is_succcess} = 0;
168 $conf->{message} = "Can not connect to $host: $@";
172 my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173 unless ($can_login) {
174 $conf->{is_success} = 0;
175 $conf->{message} = "Can not login ", $ftp->message;
179 my $can_binary = $ftp->binary();
180 unless ($can_binary) {
181 $conf->{is_success} = 0;
182 $conf->{message} = "Can not binary ", $ftp->message;
186 my $can_cwd = $ftp->cwd($directories);
188 $conf->{is_success} = 0;
189 $conf->{message} = "Can not cwd to $directories ", $ftp->message;
194 my $can_size = $ftp->size($filename);
196 $conf->{is_success} = 0;
198 = "Can not size $filename in $directories",
203 my ($can_dir) = $ftp->dir;
205 my ($can_ls) = $ftp->ls;
207 $conf->{is_success} = 0;
209 = "Can not dir or ls in $directories ",
216 $conf->{is_success} = 1;
222 # these are fine but give errors
223 ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224 ftp://ftp.stratus.com/pub/vos/utility/utility.html
226 # these are missing, sigh
227 ftp://ftp.sco.com/SLS/ptf7051e.Z
228 http://perlmonks.thepen.com/42898.html
229 http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230 http://public.activestate.com/cgi-bin/perlbrowse
231 http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232 http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233 http://my.smithmicro.com/mac/stuffit/
234 http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235 http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
236 http://www.openzaurus.org/
238 http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239 http://www.pvhp.com/~pvhp/
240 http://www.pvhp.com/%7Epvhp/
241 http://www.pvhp.com/%7epvhp/
243 http://www.madgoat.com
244 http://www.mks.com/s390/gnu/
245 http://www.research.att.com/sw/tools/uwin/
247 http://safaribooksonline.com/
248 http://use.perl.org/~autrijus/journal/25768
249 http://www.s390.ibm.com/products/oe/bpxqp1.html
250 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
251 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
252 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
253 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
254 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
255 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
256 http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
257 http://www.w3.org/Security/Faq/
259 # these are URI extraction bugs
260 http://www.perl.org/E
261 http://en.wikipedia.org/wiki/SREC_(file_format
262 http://somewhere.else',-type=/
266 http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
267 http://www.xray.mpe.mpg.de/mailing-lists/perl5-
268 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
270 http://www.perl.come/
272 # these are used as an example
274 http://something.here/
275 http://users.perl5.git.perl.org/~yourlogin/
276 http://github.com/USERNAME/perl/tree/orange
277 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
278 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
279 http://somewhere.else$/
280 http://somewhere.else$/
281 http://somewhere.else/bin/foo&bar',-Type=
282 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
283 http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
284 http://www.perl.org/test.cgi
286 http://search.cpan.org/perldoc?
288 http://cpan.dev.local/CPAN
292 ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
293 http://www14.software.ibm.com/webapp/download/downloadaz.jsp
294 http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
295 http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
296 http://localhost/tmp/index.txt
297 http://example.com/foo/bar.html
298 http://example.com/Text-Bastardize-1.06.tar.gz
299 ftp://example.com/sources/packages.txt
300 http://example.com/sources/packages.txt
301 http://example.com/sources
302 ftp://example.com/sources
303 http://some.where.com/dir/file.txt
304 http://some.where.com/dir/a.txt
308 http://www.foo.com:8000/
309 http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
310 http://decoded/mirror/path
311 http://a/b/c/d/e/f/g/h/i/j
314 http://purl.org/rss/1.0/modules/taxonomy/
315 ftp://ftp.sun.ac.za/CPAN/CPAN/
316 ftp://ftp.cpan.org/pub/mirror/index.txt
317 ftp://cpan.org/pub/mirror/index.txt
318 http://example.com/~eh/
319 http://plagger.org/.../rss
320 http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
321 http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
322 http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
323 http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
324 http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
325 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
326 http://module-build.sourceforge.net/META-spec-new.html
327 http://module-build.sourceforge.net/META-spec-v1.4.html
328 http://www.cs.vu.nl/~tmgil/vi.html
329 http://perlcomposer.sourceforge.net/vperl.html
330 http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
331 http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
332 http://world.std.com/~aep/ptkdb/
333 http://www.castlelink.co.uk/object_system/
334 http://www.fh-wedel.de/elvis/
335 ftp://ftp.blarg.net/users/amol/zsh/
336 ftp://ftp.funet.fi/pub/languages/perl/CPAN
337 http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
338 http://users.perl5.git.perl.org/~USERNAME
339 http://foo/x//y/script.cgi/a//b
340 http://xxx/script.cgi/http://foo
341 http://foo/./x//z/script.cgi/a/../b//c
342 http://somewhere.else/in/movie/land
343 http://somewhere.else/finished.html
344 http://somewhere.else/bin/foo&bar$
345 http://somewhere.else/
348 http://myrepo.example.com/
351 http://example.com:1024/
355 http://example.com:9000/index.html
356 http://proxy.example.com:8080/
358 http://[www.json::pp.org]/
360 http://foo.example.com/
362 http://whatever/man/1/crontab
364 http://whatever/Foo%3A%3ABar
366 http://remote.server.com/jquery.css
367 http://some.other.com/page.html
370 http://link.included.here?o=1&p=2
371 http://link.included.here?o=1&p=2
372 http://link.included.here?o=1&p=2
373 http://link.included.here/
374 http://foo/x//y/script.cgi/a//b
375 http://xxx/script.cgi/http://foo
376 http://foo/./x//z/script.cgi/a/../b//c
377 http://somewhere.else/in/movie/land
378 http://somewhere.else/finished.html
379 http://webproxy:3128/
382 # these are used to generate or match URLs
383 http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
384 http://www.cpantesters.org/show/%s.yaml
392 http://$addr/mark?commit=$
393 http://search.cpan.org/~
396 http://www.ietf.org/rfc/rfc$2.txt
397 http://search.cpan.org/~
400 # weird redirects that LWP doesn't like
401 http://www.theperlreview.com/community_calendar
402 http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
403 http://sunsolve.sun.com
405 # broken webserver that doesn't like HEAD requests
406 http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
407 http://www.w3.org/TR/html4/loose.dtd
409 # these have been reported upstream to CPAN authors
410 http://www.gnu.org/manual/tar/html_node/tar_139.html
411 http://www.w3.org/pub/WWW/TR/Wd-css-1.html
412 http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
413 http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
414 http://search.cpan.org/search?query=Module::Build::Convert
415 http://www.refcnt.org/papers/module-build-convert
416 http://csrc.nist.gov/cryptval/shs.html
417 http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
418 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
419 http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
420 http://www.debian.or.jp/~kubota/unicode-symbols.html.en
421 http://rfc.net/rfc2781.html
422 http://www.icu-project.org/charset/
423 http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
424 http://www.rfc-editor.org/
426 http://www.oreilly.com/people/authors/lunde/cjk_inf.html
427 http://www.oreilly.com/catalog/cjkvinfo/
428 http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
429 http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
430 http://www.egt.ie/standards/iso3166/iso3166-1-en.html
431 http://www.bsi-global.com/iso4217currency
432 http://www.plover.com/~mjd/perl/Memoize/
433 http://www.plover.com/~mjd/perl/MiniMemoize/
434 http://www.sysadminmag.com/tpj/issues/vol5_5/
435 ftp://ftp.tpc.int/tpc/server/UNIX/
436 http://www.nara.gov/genealogy/
437 http://home.utah-inter.net/kinsearch/Soundex.html
438 http://www.nara.gov/genealogy/soundex/soundex.html
439 http://rfc.net/rfc3461.html
440 ftp://ftp.cs.pdx.edu/pub/elvis/
441 http://www.fh-wedel.de/elvis/
442 http://lists.perl.org/list/perl-mvs.html
443 http://www.cpan.org/ports/os2/
444 http://github.com/dagolden/cpan-meta-spec
445 http://github.com/dagolden/cpan-meta-spec/issues
446 http://www.opensource.org/licenses/lgpl-license.phpt
447 http://reality.sgi.com/ariel
448 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449 http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
450 http://www.nsrl.nist.gov/testdata/
451 http://public.activestate.com/cgi-bin/perlbrowse/p/31194
452 http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
453 http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
454 http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
455 http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
456 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
457 http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
458 http://github.com/schwern/extutils-makemaker
459 https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
460 http://www.json.org/JSON::PP_checker/
461 ftp://ftp.kiae.su/pub/unix/fido/
462 http://www.gallistel.net/nparker/weather/code/
463 http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
464 ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
465 http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
466 http://public.activestate.com/cgi-bin/perlbrowse/p/33567
467 http://public.activestate.com/cgi-bin/perlbrowse/p/33566
468 http://www.dsmit.com/cons/
469 http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
475 checkURL.pl - Check that all the URLs in the Perl source are valid
479 This program checks that all the URLs in the Perl source are valid. It
480 checks HTTP and FTP links in parallel and contains a list of known
481 bad example links in its source. It takes 4 minutes to run on my
482 machine. The results are written to 'uris.txt' and list the filename,
483 the URL and the error:
485 * ext/Locale-Maketext/lib/Locale/Maketext.pod
486 http://sunsite.dk/RFC/rfc/rfc2277.html
490 It should be run every so often and links fixed and upstream authors
493 Note that the web is unstable and some websites are temporarily down.