1#!perl 2use strict; 3use warnings; 4use autodie; 5use feature qw(say); 6require File::Find::Rule; 7require File::Slurp; 8require File::Spec; 9require IO::Socket::SSL; 10use List::Util qw(sum); 11require LWP::UserAgent; 12require Net::FTP; 13require Parallel::Fork::BossWorkerAsync; 14require Term::ProgressBar::Simple; 15require URI::Find::Simple; 16$| = 1; 17 18my %ignore; 19while ( my $line = <main::DATA> ) { 20 chomp $line; 21 next if $line =~ /^#/; 22 next unless $line; 23 $ignore{$line} = 1; 24} 25 26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 }); 27$ua->timeout(58); 28$ua->env_proxy; 29 30my @filenames = @ARGV; 31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') 32 unless @filenames; 33 34my $total_bytes = sum map {-s} @filenames; 35 36my $extract_progress = Term::ProgressBar::Simple->new( 37 { count => $total_bytes, 38 name => 'Extracting URIs', 39 } 40); 41 42my %uris; 43foreach my $filename (@filenames) { 44 next if $filename =~ /uris\.txt/; 45 next if $filename =~ /check_uris/; 46 next if $filename =~ /\.patch$/; 47 next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod'; 48 next if $filename =~ /checkURL\.pl$/; 49 my $contents = File::Slurp::read_file($filename); 50 my @uris = URI::Find::Simple::list_uris($contents); 51 foreach my $uri (@uris) { 52 next unless $uri =~ /^(http|ftp)/; 53 next if $ignore{$uri}; 54 55 # no need to hit rt.perl.org 56 next 57 if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$}; 58 59 # no need to hit github 60 next 61 if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$}; 62 63 # no need to hit rt.cpan.org 64 next 65 if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; 66 67 # no need to hit google groups (weird redirect LWP does not like) 68 next 69 if $uri =~ m{^http://groups\.google\.com/}; 70 71 push @{ $uris{$uri} }, $filename; 72 } 73 $extract_progress += -s $filename; 74} 75 76my $bw = Parallel::Fork::BossWorkerAsync->new( 77 work_handler => \&work_alarmed, 78 global_timeout => 120, 79 worker_count => 20, 80); 81 82foreach my $uri ( keys %uris ) { 83 my @filenames = @{ $uris{$uri} }; 84 $bw->add_work( { uri => $uri, filenames => \@filenames } ); 85} 86 87undef $extract_progress; 88 89my $fetch_progress = Term::ProgressBar::Simple->new( 90 { count => scalar( keys %uris ), 91 name => 'Fetching URIs', 92 } 93); 94 95my %filenames; 96while ( $bw->pending() ) { 97 my $response = $bw->get_result(); 98 my $uri = $response->{uri}; 99 my @filenames = @{ $response->{filenames} }; 100 my $is_success = $response->{is_success}; 101 my $message = $response->{message}; 102 103 unless ($is_success) { 104 foreach my $filename (@filenames) { 105 push @{ $filenames{$filename} }, 106 { uri => $uri, message => $message }; 107 } 108 } 109 $fetch_progress++; 110} 111$bw->shut_down(); 112 113my $fh = IO::File->new('> uris.txt'); 114foreach my $filename ( sort keys %filenames ) { 115 $fh->say("* $filename"); 116 my @bits = @{ $filenames{$filename} }; 117 foreach my $bit (@bits) { 118 my $uri = $bit->{uri}; 119 my $message = $bit->{message}; 120 $fh->say(" $uri"); 121 $fh->say(" $message"); 122 } 123} 124$fh->close; 125 126say 'Finished, see uris.txt'; 127 128sub work_alarmed { 129 my $conf = shift; 130 eval { 131 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required 132 alarm 60; 133 $conf = work($conf); 134 alarm 0; 135 }; 136 if ($@) { 137 $conf->{is_success} = 0; 138 $conf->{message} = 'Timed out'; 139 140 } 141 return $conf; 142} 143 144sub work { 145 my $conf = shift; 146 my $uri = $conf->{uri}; 147 my @filenames = @{ $conf->{filenames} }; 148 149 if ( $uri =~ /^http/ ) { 150 my $uri_without_fragment = URI->new($uri); 151 my $fragment = $uri_without_fragment->fragment(undef); 152 my $response = $ua->head($uri_without_fragment); 153 154 $conf->{is_success} = $response->is_success; 155 $conf->{message} = $response->status_line; 156 return $conf; 157 } else { 158 159 my $uri_object = URI->new($uri); 160 my $host = $uri_object->host; 161 my $path = $uri_object->path; 162 my ( $volume, $directories, $filename ) 163 = File::Spec->splitpath($path); 164 165 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); 166 unless ($ftp) { 167 $conf->{is_succcess} = 0; 168 $conf->{message} = "Can not connect to $host: $@"; 169 return $conf; 170 } 171 172 my $can_login = $ftp->login( "anonymous", '-anonymous@' ); 173 unless ($can_login) { 174 $conf->{is_success} = 0; 175 $conf->{message} = "Can not login ", $ftp->message; 176 return $conf; 177 } 178 179 my $can_binary = $ftp->binary(); 180 unless ($can_binary) { 181 $conf->{is_success} = 0; 182 $conf->{message} = "Can not binary ", $ftp->message; 183 return $conf; 184 } 185 186 my $can_cwd = $ftp->cwd($directories); 187 unless ($can_cwd) { 188 $conf->{is_success} = 0; 189 $conf->{message} = "Can not cwd to $directories ", $ftp->message; 190 return $conf; 191 } 192 193 if ($filename) { 194 my $can_size = $ftp->size($filename); 195 unless ($can_size) { 196 $conf->{is_success} = 0; 197 $conf->{message} 198 = "Can not size $filename in $directories", 199 $ftp->message; 200 return $conf; 201 } 202 } else { 203 my ($can_dir) = $ftp->dir; 204 unless ($can_dir) { 205 my ($can_ls) = $ftp->ls; 206 unless ($can_ls) { 207 $conf->{is_success} = 0; 208 $conf->{message} 209 = "Can not dir or ls in $directories ", 210 $ftp->message; 211 return $conf; 212 } 213 } 214 } 215 216 $conf->{is_success} = 1; 217 return $conf; 218 } 219} 220 221__DATA__ 222# these are fine but give errors 223ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html 224ftp://ftp.stratus.com/pub/vos/utility/utility.html 225 226# these are missing, sigh 227ftp://ftp.sco.com/SLS/ptf7051e.Z 228http://perlmonks.thepen.com/42898.html 229http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/ 230http://public.activestate.com/cgi-bin/perlbrowse 231http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog 232http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631 233http://my.smithmicro.com/mac/stuffit/ 234http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html 235http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html 236http://www.openzaurus.org/ 237http://Casbah.org/ 238http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe 239http://www.pvhp.com/~pvhp/ 240http://www.pvhp.com/%7Epvhp/ 241http://www.pvhp.com/%7epvhp/ 242http://www.leo.org 243http://www.madgoat.com 244http://www.mks.com/s390/gnu/ 245http://www.research.att.com/sw/tools/uwin/ 246http://www.tpj.com/ 247http://safaribooksonline.com/ 248http://use.perl.org/~autrijus/journal/25768 249http://www.s390.ibm.com/products/oe/bpxqp1.html 250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html 251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html 252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html 253http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html 254http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html 255http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html 256http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html 257http://www.w3.org/Security/Faq/ 258 259# these are URI extraction bugs 260http://www.perl.org/E 261http://en.wikipedia.org/wiki/SREC_(file_format 262http://somewhere.else',-type=/ 263ftp:passive-mode 264ftp: 265http:[- 266http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell 267http://www.xray.mpe.mpg.de/mailing-lists/perl5- 268http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: 269http://perl.come/ 270http://www.perl.come/ 271 272# these are used as an example 273http://example.com/ 274http://something.here/ 275http://users.perl5.git.perl.org/~yourlogin/ 276http://github.com/USERNAME/perl/tree/orange 277http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 278http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 279http://somewhere.else$/ 280http://somewhere.else$/ 281http://somewhere.else/bin/foo&bar',-Type= 282http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 283http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 284http://www.perl.org/test.cgi 285http://cpan2.local/ 286http://search.cpan.org/perldoc? 287http://cpan1.local/ 288http://cpan.dev.local/CPAN 289http:/// 290ftp:// 291ftp://myurl/ 292ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff 293http://www14.software.ibm.com/webapp/download/downloadaz.jsp 294http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz 295http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT 296http://localhost/tmp/index.txt 297http://example.com/foo/bar.html 298http://example.com/Text-Bastardize-1.06.tar.gz 299ftp://example.com/sources/packages.txt 300http://example.com/sources/packages.txt 301http://example.com/sources 302ftp://example.com/sources 303http://some.where.com/dir/file.txt 304http://some.where.com/dir/a.txt 305http://foo.com/X.tgz 306ftp://foo.com/X.tgz 307http://foo/ 308http://www.foo.com:8000/ 309http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args 310http://decoded/mirror/path 311http://a/b/c/d/e/f/g/h/i/j 312http://foo/bar.gz 313ftp://ftp.perl.org 314http://purl.org/rss/1.0/modules/taxonomy/ 315ftp://ftp.sun.ac.za/CPAN/CPAN/ 316ftp://ftp.cpan.org/pub/mirror/index.txt 317ftp://cpan.org/pub/mirror/index.txt 318http://example.com/~eh/ 319http://plagger.org/.../rss 320http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz 321http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz 322http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz 323http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz 324http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz 325http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 326http://module-build.sourceforge.net/META-spec-new.html 327http://module-build.sourceforge.net/META-spec-v1.4.html 328http://www.cs.vu.nl/~tmgil/vi.html 329http://perlcomposer.sourceforge.net/vperl.html 330http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep 331http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html 332http://world.std.com/~aep/ptkdb/ 333http://www.castlelink.co.uk/object_system/ 334http://www.fh-wedel.de/elvis/ 335ftp://ftp.blarg.net/users/amol/zsh/ 336ftp://ftp.funet.fi/pub/languages/perl/CPAN 337http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 338http://users.perl5.git.perl.org/~USERNAME 339http://foo/x//y/script.cgi/a//b 340http://xxx/script.cgi/http://foo 341http://foo/./x//z/script.cgi/a/../b//c 342http://somewhere.else/in/movie/land 343http://somewhere.else/finished.html 344http://somewhere.else/bin/foo&bar$ 345http://somewhere.else/ 346http://proxy:8484/ 347http://proxy/ 348http://myrepo.example.com/ 349http://remote/source 350https://example.com/ 351http://example.com:1024/ 352http:///path?foo=bar 353http://[::]:1024/ 354http://([/ 355http://example.com:9000/index.html 356http://proxy.example.com:8080/ 357http:///index.html 358http://[www.json::pp.org]/ 359http://localhost/ 360http://foo.example.com/ 361http://abc.com/a.js 362http://whatever/man/1/crontab 363http://abc.com/c.js 364http://whatever/Foo%3A%3ABar 365http://abc.com/b.js 366http://remote.server.com/jquery.css 367http://some.other.com/page.html 368https://text.com/1/2 369https://text.com/1/2 370http://link.included.here?o=1&p=2 371http://link.included.here?o=1&p=2 372http://link.included.here?o=1&p=2 373http://link.included.here/ 374http://foo/x//y/script.cgi/a//b 375http://xxx/script.cgi/http://foo 376http://foo/./x//z/script.cgi/a/../b//c 377http://somewhere.else/in/movie/land 378http://somewhere.else/finished.html 379http://webproxy:3128/ 380http://www/ 381 382# these are used to generate or match URLs 383http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist 384http://www.cpantesters.org/show/%s.yaml 385ftp://(.*?)/(.*)/(.* 386ftp://(.*?)/(.*)/(.* 387ftp://(.*?)/(.*)/(.* 388ftp://ftp.foo.bar/ 389http://$host/ 390http://wwwe%3C46/ 391ftp:/ 392http://$addr/mark?commit=$ 393http://search.cpan.org/~ 394http:/ 395ftp:%5Cn$url 396http://www.ietf.org/rfc/rfc$2.txt 397http://search.cpan.org/~ 398ftp:%5Cn$url 399 400# weird redirects that LWP doesn't like 401http://www.theperlreview.com/community_calendar 402http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL 403http://sunsolve.sun.com 404 405# broken webserver that doesn't like HEAD requests 406http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view 407http://www.w3.org/TR/html4/loose.dtd 408 409# these have been reported upstream to CPAN authors 410http://www.gnu.org/manual/tar/html_node/tar_139.html 411http://www.w3.org/pub/WWW/TR/Wd-css-1.html 412http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP 413http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp 414http://search.cpan.org/search?query=Module::Build::Convert 415http://www.refcnt.org/papers/module-build-convert 416http://csrc.nist.gov/cryptval/shs.html 417http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp 418http://www.debian.or.jp/~kubota/unicode-symbols.html.en 419http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html 420http://www.debian.or.jp/~kubota/unicode-symbols.html.en 421http://rfc.net/rfc2781.html 422http://www.icu-project.org/charset/ 423http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html 424http://www.rfc-editor.org/ 425http://www.rfc.net/ 426http://www.oreilly.com/people/authors/lunde/cjk_inf.html 427http://www.oreilly.com/catalog/cjkvinfo/ 428http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz 429http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz 430http://www.egt.ie/standards/iso3166/iso3166-1-en.html 431http://www.bsi-global.com/iso4217currency 432http://www.plover.com/~mjd/perl/Memoize/ 433http://www.plover.com/~mjd/perl/MiniMemoize/ 434http://www.sysadminmag.com/tpj/issues/vol5_5/ 435ftp://ftp.tpc.int/tpc/server/UNIX/ 436http://www.nara.gov/genealogy/ 437http://home.utah-inter.net/kinsearch/Soundex.html 438http://www.nara.gov/genealogy/soundex/soundex.html 439http://rfc.net/rfc3461.html 440ftp://ftp.cs.pdx.edu/pub/elvis/ 441http://www.fh-wedel.de/elvis/ 442http://lists.perl.org/list/perl-mvs.html 443http://www.cpan.org/ports/os2/ 444http://github.com/dagolden/cpan-meta-spec 445http://github.com/dagolden/cpan-meta-spec/issues 446http://www.opensource.org/licenses/lgpl-license.phpt 447http://reality.sgi.com/ariel 448http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html 449http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html 450http://www.nsrl.nist.gov/testdata/ 451http://public.activestate.com/cgi-bin/perlbrowse/p/31194 452http://public.activestate.com/cgi-bin/perlbrowse?patch=16173 453http://public.activestate.com/cgi-bin/perlbrowse?patch=16049 454http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html 455http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118 456http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut 457http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf 458http://github.com/schwern/extutils-makemaker 459https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite 460http://www.json.org/JSON::PP_checker/ 461ftp://ftp.kiae.su/pub/unix/fido/ 462http://www.gallistel.net/nparker/weather/code/ 463http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html 464ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/ 465http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html 466http://public.activestate.com/cgi-bin/perlbrowse/p/33567 467http://public.activestate.com/cgi-bin/perlbrowse/p/33566 468http://www.dsmit.com/cons/ 469http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide 470 471__END__ 472 473=head1 NAME 474 475checkURL.pl - Check that all the URLs in the Perl source are valid 476 477=head1 DESCRIPTION 478 479This program checks that all the URLs in the Perl source are valid. It 480checks HTTP and FTP links in parallel and contains a list of known 481bad example links in its source. It takes 4 minutes to run on my 482machine. The results are written to 'uris.txt' and list the filename, 483the URL and the error: 484 485 * ext/Locale-Maketext/lib/Locale/Maketext.pod 486 http://sunsite.dk/RFC/rfc/rfc2277.html 487 404 Not Found 488 ... 489 490It should be run every so often and links fixed and upstream authors 491notified. 492 493Note that the web is unstable and some websites are temporarily down. 494