| 1 | #!perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use autodie; |
| 5 | use feature qw(say); |
| 6 | require File::Find::Rule; |
| 7 | require File::Slurp; |
| 8 | require File::Spec; |
| 9 | require IO::Socket::SSL; |
| 10 | use List::Util qw(sum); |
| 11 | require LWP::UserAgent; |
| 12 | require Net::FTP; |
| 13 | require Parallel::Fork::BossWorkerAsync; |
| 14 | require Term::ProgressBar::Simple; |
| 15 | require URI::Find::Simple; |
| 16 | $| = 1; |
| 17 | |
| 18 | my %ignore; |
| 19 | while ( my $line = <main::DATA> ) { |
| 20 | chomp $line; |
| 21 | next if $line =~ /^#/; |
| 22 | next unless $line; |
| 23 | $ignore{$line} = 1; |
| 24 | } |
| 25 | |
| 26 | my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 }); |
| 27 | $ua->timeout(58); |
| 28 | $ua->env_proxy; |
| 29 | |
| 30 | my @filenames = @ARGV; |
| 31 | @filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') |
| 32 | unless @filenames; |
| 33 | |
| 34 | my $total_bytes = sum map {-s} @filenames; |
| 35 | |
| 36 | my $extract_progress = Term::ProgressBar::Simple->new( |
| 37 | { count => $total_bytes, |
| 38 | name => 'Extracting URIs', |
| 39 | } |
| 40 | ); |
| 41 | |
| 42 | my %uris; |
| 43 | foreach my $filename (@filenames) { |
| 44 | next if $filename =~ /uris\.txt/; |
| 45 | next if $filename =~ /check_uris/; |
| 46 | next if $filename =~ /\.patch$/; |
| 47 | next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod'; |
| 48 | next if $filename =~ /checkURL\.pl$/; |
| 49 | my $contents = File::Slurp::read_file($filename); |
| 50 | my @uris = URI::Find::Simple::list_uris($contents); |
| 51 | foreach my $uri (@uris) { |
| 52 | next unless $uri =~ /^(http|ftp)/; |
| 53 | next if $ignore{$uri}; |
| 54 | |
| 55 | # no need to hit rt.perl.org |
| 56 | next |
| 57 | if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$}; |
| 58 | |
| 59 | # no need to hit rt.cpan.org |
| 60 | next |
| 61 | if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; |
| 62 | |
| 63 | # no need to hit google groups (weird redirect LWP does not like) |
| 64 | next |
| 65 | if $uri =~ m{^http://groups\.google\.com/}; |
| 66 | |
| 67 | push @{ $uris{$uri} }, $filename; |
| 68 | } |
| 69 | $extract_progress += -s $filename; |
| 70 | } |
| 71 | |
| 72 | my $bw = Parallel::Fork::BossWorkerAsync->new( |
| 73 | work_handler => \&work_alarmed, |
| 74 | global_timeout => 120, |
| 75 | worker_count => 20, |
| 76 | ); |
| 77 | |
| 78 | foreach my $uri ( keys %uris ) { |
| 79 | my @filenames = @{ $uris{$uri} }; |
| 80 | $bw->add_work( { uri => $uri, filenames => \@filenames } ); |
| 81 | } |
| 82 | |
| 83 | undef $extract_progress; |
| 84 | |
| 85 | my $fetch_progress = Term::ProgressBar::Simple->new( |
| 86 | { count => scalar( keys %uris ), |
| 87 | name => 'Fetching URIs', |
| 88 | } |
| 89 | ); |
| 90 | |
| 91 | my %filenames; |
| 92 | while ( $bw->pending() ) { |
| 93 | my $response = $bw->get_result(); |
| 94 | my $uri = $response->{uri}; |
| 95 | my @filenames = @{ $response->{filenames} }; |
| 96 | my $is_success = $response->{is_success}; |
| 97 | my $message = $response->{message}; |
| 98 | |
| 99 | unless ($is_success) { |
| 100 | foreach my $filename (@filenames) { |
| 101 | push @{ $filenames{$filename} }, |
| 102 | { uri => $uri, message => $message }; |
| 103 | } |
| 104 | } |
| 105 | $fetch_progress++; |
| 106 | } |
| 107 | $bw->shut_down(); |
| 108 | |
| 109 | my $fh = IO::File->new('> uris.txt'); |
| 110 | foreach my $filename ( sort keys %filenames ) { |
| 111 | $fh->say("* $filename"); |
| 112 | my @bits = @{ $filenames{$filename} }; |
| 113 | foreach my $bit (@bits) { |
| 114 | my $uri = $bit->{uri}; |
| 115 | my $message = $bit->{message}; |
| 116 | $fh->say(" $uri"); |
| 117 | $fh->say(" $message"); |
| 118 | } |
| 119 | } |
| 120 | $fh->close; |
| 121 | |
| 122 | say 'Finished, see uris.txt'; |
| 123 | |
| 124 | sub work_alarmed { |
| 125 | my $conf = shift; |
| 126 | eval { |
| 127 | local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required |
| 128 | alarm 60; |
| 129 | $conf = work($conf); |
| 130 | alarm 0; |
| 131 | }; |
| 132 | if ($@) { |
| 133 | $conf->{is_success} = 0; |
| 134 | $conf->{message} = 'Timed out'; |
| 135 | |
| 136 | } |
| 137 | return $conf; |
| 138 | } |
| 139 | |
| 140 | sub work { |
| 141 | my $conf = shift; |
| 142 | my $uri = $conf->{uri}; |
| 143 | my @filenames = @{ $conf->{filenames} }; |
| 144 | |
| 145 | if ( $uri =~ /^http/ ) { |
| 146 | my $uri_without_fragment = URI->new($uri); |
| 147 | my $fragment = $uri_without_fragment->fragment(undef); |
| 148 | my $response = $ua->head($uri_without_fragment); |
| 149 | |
| 150 | $conf->{is_success} = $response->is_success; |
| 151 | $conf->{message} = $response->status_line; |
| 152 | return $conf; |
| 153 | } else { |
| 154 | |
| 155 | my $uri_object = URI->new($uri); |
| 156 | my $host = $uri_object->host; |
| 157 | my $path = $uri_object->path; |
| 158 | my ( $volume, $directories, $filename ) |
| 159 | = File::Spec->splitpath($path); |
| 160 | |
| 161 | my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); |
| 162 | unless ($ftp) { |
| 163 | $conf->{is_succcess} = 0; |
| 164 | $conf->{message} = "Can not connect to $host: $@"; |
| 165 | return $conf; |
| 166 | } |
| 167 | |
| 168 | my $can_login = $ftp->login( "anonymous", '-anonymous@' ); |
| 169 | unless ($can_login) { |
| 170 | $conf->{is_success} = 0; |
| 171 | $conf->{message} = "Can not login ", $ftp->message; |
| 172 | return $conf; |
| 173 | } |
| 174 | |
| 175 | my $can_binary = $ftp->binary(); |
| 176 | unless ($can_binary) { |
| 177 | $conf->{is_success} = 0; |
| 178 | $conf->{message} = "Can not binary ", $ftp->message; |
| 179 | return $conf; |
| 180 | } |
| 181 | |
| 182 | my $can_cwd = $ftp->cwd($directories); |
| 183 | unless ($can_cwd) { |
| 184 | $conf->{is_success} = 0; |
| 185 | $conf->{message} = "Can not cwd to $directories ", $ftp->message; |
| 186 | return $conf; |
| 187 | } |
| 188 | |
| 189 | if ($filename) { |
| 190 | my $can_size = $ftp->size($filename); |
| 191 | unless ($can_size) { |
| 192 | $conf->{is_success} = 0; |
| 193 | $conf->{message} |
| 194 | = "Can not size $filename in $directories", |
| 195 | $ftp->message; |
| 196 | return $conf; |
| 197 | } |
| 198 | } else { |
| 199 | my ($can_dir) = $ftp->dir; |
| 200 | unless ($can_dir) { |
| 201 | my ($can_ls) = $ftp->ls; |
| 202 | unless ($can_ls) { |
| 203 | $conf->{is_success} = 0; |
| 204 | $conf->{message} |
| 205 | = "Can not dir or ls in $directories ", |
| 206 | $ftp->message; |
| 207 | return $conf; |
| 208 | } |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | $conf->{is_success} = 1; |
| 213 | return $conf; |
| 214 | } |
| 215 | } |
| 216 | |
| 217 | __DATA__ |
| 218 | # these are fine but give errors |
| 219 | ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html |
| 220 | ftp://ftp.stratus.com/pub/vos/utility/utility.html |
| 221 | |
| 222 | # these are missing, sigh |
| 223 | ftp://ftp.sco.com/SLS/ptf7051e.Z |
| 224 | http://perlmonks.thepen.com/42898.html |
| 225 | http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/ |
| 226 | http://public.activestate.com/cgi-bin/perlbrowse |
| 227 | http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog |
| 228 | http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631 |
| 229 | http://my.smithmicro.com/mac/stuffit/ |
| 230 | http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html |
| 231 | http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html |
| 232 | |
| 233 | # these are URI extraction bugs |
| 234 | http://www.perl.org/E |
| 235 | http://en.wikipedia.org/wiki/SREC_(file_format |
| 236 | http://somewhere.else',-type=/ |
| 237 | ftp:passive-mode |
| 238 | ftp: |
| 239 | http:[- |
| 240 | http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell |
| 241 | http://www.xray.mpe.mpg.de/mailing-lists/perl5- |
| 242 | http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: |
| 243 | http://perl.come/ |
| 244 | http://www.perl.come/ |
| 245 | |
| 246 | # these are used as an example |
| 247 | http://example.com/ |
| 248 | http://something.here/ |
| 249 | http://users.perl5.git.perl.org/~yourlogin/ |
| 250 | http://github.com/USERNAME/perl/tree/orange |
| 251 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi |
| 252 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar |
| 253 | http://somewhere.else$/ |
| 254 | http://somewhere.else$/ |
| 255 | http://somewhere.else/bin/foo&bar',-Type= |
| 256 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi |
| 257 | http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar |
| 258 | http://www.perl.org/test.cgi |
| 259 | http://cpan2.local/ |
| 260 | http://search.cpan.org/perldoc? |
| 261 | http://cpan1.local/ |
| 262 | http://cpan.dev.local/CPAN |
| 263 | http:/// |
| 264 | ftp:// |
| 265 | ftp://myurl/ |
| 266 | ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff |
| 267 | http://www14.software.ibm.com/webapp/download/downloadaz.jsp |
| 268 | http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz |
| 269 | http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT |
| 270 | http://localhost/tmp/index.txt |
| 271 | http://example.com/foo/bar.html |
| 272 | http://example.com/Text-Bastardize-1.06.tar.gz |
| 273 | ftp://example.com/sources/packages.txt |
| 274 | http://example.com/sources/packages.txt |
| 275 | http://example.com/sources |
| 276 | ftp://example.com/sources |
| 277 | http://some.where.com/dir/file.txt |
| 278 | http://some.where.com/dir/a.txt |
| 279 | http://foo.com/X.tgz |
| 280 | ftp://foo.com/X.tgz |
| 281 | http://foo/ |
| 282 | http://www.foo.com:8000/ |
| 283 | http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args |
| 284 | http://decoded/mirror/path |
| 285 | http://a/b/c/d/e/f/g/h/i/j |
| 286 | http://foo/bar.gz |
| 287 | ftp://ftp.perl.org |
| 288 | http://purl.org/rss/1.0/modules/taxonomy/ |
| 289 | ftp://ftp.sun.ac.za/CPAN/CPAN/ |
| 290 | ftp://ftp.cpan.org/pub/mirror/index.txt |
| 291 | ftp://cpan.org/pub/mirror/index.txt |
| 292 | http://example.com/~eh/ |
| 293 | http://plagger.org/.../rss |
| 294 | http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz |
| 295 | http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz |
| 296 | http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz |
| 297 | http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz |
| 298 | http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz |
| 299 | http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip |
| 300 | http://module-build.sourceforge.net/META-spec-new.html |
| 301 | http://module-build.sourceforge.net/META-spec-v1.4.html |
| 302 | http://www.cs.vu.nl/~tmgil/vi.html |
| 303 | http://perlcomposer.sourceforge.net/vperl.html |
| 304 | http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep |
| 305 | http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html |
| 306 | http://world.std.com/~aep/ptkdb/ |
| 307 | http://www.castlelink.co.uk/object_system/ |
| 308 | http://www.fh-wedel.de/elvis/ |
| 309 | ftp://ftp.blarg.net/users/amol/zsh/ |
| 310 | ftp://ftp.funet.fi/pub/languages/perl/CPAN |
| 311 | http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip |
| 312 | http://users.perl5.git.perl.org/~USERNAME |
| 313 | http://foo/x//y/script.cgi/a//b |
| 314 | http://xxx/script.cgi/http://foo |
| 315 | http://foo/./x//z/script.cgi/a/../b//c |
| 316 | http://somewhere.else/in/movie/land |
| 317 | http://somewhere.else/finished.html |
| 318 | http://somewhere.else/bin/foo&bar$ |
| 319 | http://somewhere.else/ |
| 320 | http://proxy:8484/ |
| 321 | http://proxy/ |
| 322 | http://myrepo.example.com/ |
| 323 | http://remote/source |
| 324 | https://example.com/ |
| 325 | http://example.com:1024/ |
| 326 | http:///path?foo=bar |
| 327 | http://[::]:1024/ |
| 328 | http://([/ |
| 329 | http://example.com:9000/index.html |
| 330 | http://proxy.example.com:8080/ |
| 331 | http:///index.html |
| 332 | http://[www.json::pp.org]/ |
| 333 | http://localhost/ |
| 334 | http://foo.example.com/ |
| 335 | http://abc.com/a.js |
| 336 | http://whatever/man/1/crontab |
| 337 | http://abc.com/c.js |
| 338 | http://whatever/Foo%3A%3ABar |
| 339 | http://abc.com/b.js |
| 340 | http://remote.server.com/jquery.css |
| 341 | http://some.other.com/page.html |
| 342 | https://text.com/1/2 |
| 343 | https://text.com/1/2 |
| 344 | http://link.included.here?o=1&p=2 |
| 345 | http://link.included.here?o=1&p=2 |
| 346 | http://link.included.here?o=1&p=2 |
| 347 | http://link.included.here/ |
| 348 | http://foo/x//y/script.cgi/a//b |
| 349 | http://xxx/script.cgi/http://foo |
| 350 | http://foo/./x//z/script.cgi/a/../b//c |
| 351 | http://somewhere.else/in/movie/land |
| 352 | http://somewhere.else/finished.html |
| 353 | http://webproxy:3128/ |
| 354 | http://www/ |
| 355 | |
| 356 | # these are used to generate or match URLs |
| 357 | http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist |
| 358 | http://www.cpantesters.org/show/%s.yaml |
| 359 | ftp://(.*?)/(.*)/(.* |
| 360 | ftp://(.*?)/(.*)/(.* |
| 361 | ftp://(.*?)/(.*)/(.* |
| 362 | ftp://ftp.foo.bar/ |
| 363 | http://$host/ |
| 364 | http://wwwe%3C46/ |
| 365 | ftp:/ |
| 366 | http://$addr/mark?commit=$ |
| 367 | http://search.cpan.org/~ |
| 368 | http:/ |
| 369 | ftp:%5Cn$url |
| 370 | http://www.ietf.org/rfc/rfc$2.txt |
| 371 | http://search.cpan.org/~ |
| 372 | ftp:%5Cn$url |
| 373 | |
| 374 | # weird redirects that LWP doesn't like |
| 375 | http://www.theperlreview.com/community_calendar |
| 376 | http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL |
| 377 | http://sunsolve.sun.com |
| 378 | |
| 379 | # broken webserver that doesn't like HEAD requests |
| 380 | http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view |
| 381 | http://www.w3.org/TR/html4/loose.dtd |
| 382 | |
| 383 | # these have been reported upstream to CPAN authors |
| 384 | http://www.gnu.org/manual/tar/html_node/tar_139.html |
| 385 | http://www.w3.org/pub/WWW/TR/Wd-css-1.html |
| 386 | http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP |
| 387 | http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp |
| 388 | http://search.cpan.org/search?query=Module::Build::Convert |
| 389 | http://www.refcnt.org/papers/module-build-convert |
| 390 | http://csrc.nist.gov/cryptval/shs.html |
| 391 | http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp |
| 392 | http://www.debian.or.jp/~kubota/unicode-symbols.html.en |
| 393 | http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html |
| 394 | http://www.debian.or.jp/~kubota/unicode-symbols.html.en |
| 395 | http://rfc.net/rfc2781.html |
| 396 | http://www.icu-project.org/charset/ |
| 397 | http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html |
| 398 | http://www.rfc-editor.org/ |
| 399 | http://www.rfc.net/ |
| 400 | http://www.oreilly.com/people/authors/lunde/cjk_inf.html |
| 401 | http://www.oreilly.com/catalog/cjkvinfo/ |
| 402 | http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz |
| 403 | http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz |
| 404 | http://www.egt.ie/standards/iso3166/iso3166-1-en.html |
| 405 | http://www.bsi-global.com/iso4217currency |
| 406 | http://www.plover.com/~mjd/perl/Memoize/ |
| 407 | http://www.plover.com/~mjd/perl/MiniMemoize/ |
| 408 | http://www.sysadminmag.com/tpj/issues/vol5_5/ |
| 409 | ftp://ftp.tpc.int/tpc/server/UNIX/ |
| 410 | http://www.nara.gov/genealogy/ |
| 411 | http://home.utah-inter.net/kinsearch/Soundex.html |
| 412 | http://www.nara.gov/genealogy/soundex/soundex.html |
| 413 | http://rfc.net/rfc3461.html |
| 414 | ftp://ftp.cs.pdx.edu/pub/elvis/ |
| 415 | http://www.fh-wedel.de/elvis/ |
| 416 | http://lists.perl.org/list/perl-mvs.html |
| 417 | http://www.cpan.org/ports/os2/ |
| 418 | http://github.com/dagolden/cpan-meta-spec |
| 419 | http://github.com/dagolden/cpan-meta-spec/issues |
| 420 | http://www.opensource.org/licenses/lgpl-license.phpt |
| 421 | http://reality.sgi.com/ariel |
| 422 | http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html |
| 423 | http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html |
| 424 | http://www.nsrl.nist.gov/testdata/ |
| 425 | http://public.activestate.com/cgi-bin/perlbrowse/p/31194 |
| 426 | http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 |
| 427 | http://public.activestate.com/cgi-bin/perlbrowse?patch=16049 |
| 428 | http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html |
| 429 | http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118 |
| 430 | http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut |
| 431 | http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf |
| 432 | http://github.com/schwern/extutils-makemaker |
| 433 | https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite |
| 434 | http://www.json.org/JSON::PP_checker/ |
| 435 | ftp://ftp.kiae.su/pub/unix/fido/ |
| 436 | http://www.gallistel.net/nparker/weather/code/ |
| 437 | http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html |
| 438 | ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/ |
| 439 | http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html |
| 440 | http://public.activestate.com/cgi-bin/perlbrowse/p/33567 |
| 441 | http://public.activestate.com/cgi-bin/perlbrowse/p/33566 |
| 442 | http://www.dsmit.com/cons/ |
| 443 | http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide |
| 444 | |
| 445 | __END__ |
| 446 | |
| 447 | =head1 NAME |
| 448 | |
| 449 | checkURL.pl - Check that all the URLs in the Perl source are valid |
| 450 | |
| 451 | =head1 DESCRIPTION |
| 452 | |
| 453 | This program checks that all the URLs in the Perl source are valid. It |
| 454 | checks HTTP and FTP links in parallel and contains a list of known |
| 455 | bad example links in its source. It takes 4 minutes to run on my |
| 456 | machine. The results are written to 'uris.txt' and list the filename, |
| 457 | the URL and the error: |
| 458 | |
| 459 | * ext/Locale-Maketext/lib/Locale/Maketext.pod |
| 460 | http://sunsite.dk/RFC/rfc/rfc2277.html |
| 461 | 404 Not Found |
| 462 | ... |
| 463 | |
| 464 | It should be run every so often and links fixed and upstream authors |
| 465 | notified. |
| 466 | |
| 467 | Note that the web is unstable and some websites are temporarily down. |