1#!perl 2use strict; 3use warnings; 4use autodie; 5use feature qw(say); 6use File::Find::Rule; 7use File::Slurp; 8use File::Spec; 9use IO::Socket::SSL; 10use List::Util qw(sum); 11use LWP::UserAgent; 12use Net::FTP; 13use Parallel::Fork::BossWorkerAsync; 14use Term::ProgressBar::Simple; 15use URI::Find::Simple qw( list_uris ); 16$| = 1; 17 18my %ignore; 19while ( my $line = <main::DATA> ) { 20 chomp $line; 21 next if $line =~ /^#/; 22 next unless $line; 23 $ignore{$line} = 1; 24} 25 26my $ua = LWP::UserAgent->new; 27$ua->timeout(58); 28$ua->env_proxy; 29 30my @filenames = @ARGV; 31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.') 32 unless @filenames; 33 34my $total_bytes = sum map {-s} @filenames; 35 36my $extract_progress = Term::ProgressBar::Simple->new( 37 { count => $total_bytes, 38 name => 'Extracting URIs', 39 } 40); 41 42my %uris; 43foreach my $filename (@filenames) { 44 next if $filename =~ /uris\.txt/; 45 next if $filename =~ /check_uris/; 46 next if $filename =~ /\.patch$/; 47 my $contents = read_file($filename); 48 my @uris = list_uris($contents); 49 foreach my $uri (@uris) { 50 next unless $uri =~ /^(http|ftp)/; 51 next if $ignore{$uri}; 52 53 # no need to hit rt.perl.org 54 next 55 if $uri =~ m{^http://rt.perl.org/rt3/Ticket/Display.html?id=\d+$}; 56 57 # no need to hit rt.cpan.org 58 next 59 if $uri =~ m{^http://rt.cpan.org/Public/Bug/Display.html?id=\d+$}; 60 push @{ $uris{$uri} }, $filename; 61 } 62 $extract_progress += -s $filename; 63} 64 65my $bw = Parallel::Fork::BossWorkerAsync->new( 66 work_handler => \&work_alarmed, 67 global_timeout => 120, 68 worker_count => 20, 69); 70 71foreach my $uri ( keys %uris ) { 72 my @filenames = @{ $uris{$uri} }; 73 $bw->add_work( { uri => $uri, filenames => \@filenames } ); 74} 75 76undef $extract_progress; 77 78my $fetch_progress = Term::ProgressBar::Simple->new( 79 { count => scalar( keys %uris ), 80 name => 'Fetching URIs', 81 } 82); 83 84my %filenames; 85while ( $bw->pending() ) { 86 my $response = $bw->get_result(); 87 my $uri = $response->{uri}; 88 my @filenames = @{ $response->{filenames} }; 89 my $is_success = $response->{is_success}; 90 my $message = $response->{message}; 91 92 unless ($is_success) { 93 foreach my $filename (@filenames) { 94 push @{ $filenames{$filename} }, 95 { uri => $uri, message => $message }; 96 } 97 } 98 $fetch_progress++; 99} 100$bw->shut_down(); 101 102my $fh = IO::File->new('> uris.txt'); 103foreach my $filename ( sort keys %filenames ) { 104 $fh->say("* $filename"); 105 my @bits = @{ $filenames{$filename} }; 106 foreach my $bit (@bits) { 107 my $uri = $bit->{uri}; 108 my $message = $bit->{message}; 109 $fh->say(" $uri"); 110 $fh->say(" $message"); 111 } 112} 113$fh->close; 114 115say 'Finished, see uris.txt'; 116 117sub work_alarmed { 118 my $conf = shift; 119 eval { 120 local $SIG{ALRM} = sub { die "alarm\n" }; # NB: \n required 121 alarm 60; 122 $conf = work($conf); 123 alarm 0; 124 }; 125 if ($@) { 126 $conf->{is_success} = 0; 127 $conf->{message} = 'Timed out'; 128 129 } 130 return $conf; 131} 132 133sub work { 134 my $conf = shift; 135 my $uri = $conf->{uri}; 136 my @filenames = @{ $conf->{filenames} }; 137 138 if ( $uri =~ /^http/ ) { 139 my $uri_without_fragment = URI->new($uri); 140 my $fragment = $uri_without_fragment->fragment(undef); 141 my $response = $ua->head($uri_without_fragment); 142 143 $conf->{is_success} = $response->is_success; 144 $conf->{message} = $response->status_line; 145 return $conf; 146 } else { 147 148 my $uri_object = URI->new($uri); 149 my $host = $uri_object->host; 150 my $path = $uri_object->path; 151 my ( $volume, $directories, $filename ) 152 = File::Spec->splitpath($path); 153 154 my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 ); 155 unless ($ftp) { 156 $conf->{is_succcess} = 0; 157 $conf->{message} = "Can not connect to $host: $@"; 158 return $conf; 159 } 160 161 my $can_login = $ftp->login( "anonymous", '-anonymous@' ); 162 unless ($can_login) { 163 $conf->{is_success} = 0; 164 $conf->{message} = "Can not login ", $ftp->message; 165 return $conf; 166 } 167 168 my $can_binary = $ftp->binary(); 169 unless ($can_binary) { 170 $conf->{is_success} = 0; 171 $conf->{message} = "Can not binary ", $ftp->message; 172 return $conf; 173 } 174 175 my $can_cwd = $ftp->cwd($directories); 176 unless ($can_cwd) { 177 $conf->{is_success} = 0; 178 $conf->{message} = "Can not cwd to $directories ", $ftp->message; 179 return $conf; 180 } 181 182 if ($filename) { 183 my $can_size = $ftp->size($filename); 184 unless ($can_size) { 185 $conf->{is_success} = 0; 186 $conf->{message} 187 = "Can not size $filename in $directories", 188 $ftp->message; 189 return $conf; 190 } 191 } else { 192 my ($can_dir) = $ftp->dir; 193 unless ($can_dir) { 194 my ($can_ls) = $ftp->ls; 195 unless ($can_ls) { 196 $conf->{is_success} = 0; 197 $conf->{message} 198 = "Can not dir or ls in $directories ", 199 $ftp->message; 200 return $conf; 201 } 202 } 203 } 204 205 $conf->{is_success} = 1; 206 return $conf; 207 } 208} 209 210__DATA__ 211# these are fine but give errors 212ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html 213ftp://ftp.stratus.com/pub/vos/utility/utility.html 214 215# this is missing, sigh 216ftp://ftp.sco.com/SLS/ptf7051e.Z 217http://perlmonks.thepen.com/42898.html 218 219# this are URI extraction bugs 220http://www.perl.org/E 221http://en.wikipedia.org/wiki/SREC_(file_format 222http://somewhere.else',-type=/ 223ftp:passive-mode 224ftp: 225http:[- 226http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell 227http://www.xray.mpe.mpg.de/mailing-lists/perl5- 228http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP: 229 230# these are used as an example 231http://example.com/ 232http://something.here/ 233http://users.perl5.git.perl.org/~yourlogin/ 234http://github.com/USERNAME/perl/tree/orange 235http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 236http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 237http://somewhere.else$/ 238http://somewhere.else$/ 239http://somewhere.else/bin/foo&bar',-Type= 240http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi 241http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar 242http://www.perl.org/test.cgi 243http://cpan2.local/ 244http://search.cpan.org/perldoc? 245http://cpan1.local/ 246http://cpan.dev.local/CPAN 247http:/// 248ftp:// 249ftp://myurl/ 250ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff 251http://www14.software.ibm.com/webapp/download/downloadaz.jsp 252http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz 253http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT 254http://localhost/tmp/index.txt 255http://example.com/foo/bar.html 256http://example.com/Text-Bastardize-1.06.tar.gz 257ftp://example.com/sources/packages.txt 258http://example.com/sources/packages.txt 259http://example.com/sources 260ftp://example.com/sources 261http://some.where.com/dir/file.txt 262http://some.where.com/dir/a.txt 263http://foo.com/X.tgz 264ftp://foo.com/X.tgz 265http://foo/ 266http://www.foo.com:8000/ 267http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args 268http://decoded/mirror/path 269http://a/b/c/d/e/f/g/h/i/j 270http://foo/bar.gz 271ftp://ftp.perl.org 272http://purl.org/rss/1.0/modules/taxonomy/ 273ftp://ftp.sun.ac.za/CPAN/CPAN/ 274ftp://ftp.cpan.org/pub/mirror/index.txt 275ftp://cpan.org/pub/mirror/index.txt 276http://example.com/~eh/ 277http://plagger.org/.../rss 278http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz 279http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz 280http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz 281http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz 282http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz 283http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 284http://module-build.sourceforge.net/META-spec-new.html 285http://module-build.sourceforge.net/META-spec-v1.4.html 286http://www.cs.vu.nl/~tmgil/vi.html 287http://perlcomposer.sourceforge.net/vperl.html 288http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep 289http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html 290http://world.std.com/~aep/ptkdb/ 291http://www.castlelink.co.uk/object_system/ 292http://www.fh-wedel.de/elvis/ 293ftp://ftp.blarg.net/users/amol/zsh/ 294ftp://ftp.funet.fi/pub/languages/perl/CPAN 295http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip 296 297# these are used to generate or match URLs 298http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist 299http://www.cpantesters.org/show/%s.yaml 300ftp://(.*?)/(.*)/(.* 301ftp://(.*?)/(.*)/(.* 302ftp://(.*?)/(.*)/(.* 303ftp://ftp.foo.bar/ 304http://$host/ 305http://wwwe%3C46/ 306ftp:/ 307 308# weird redirects that LWP doesn't like 309http://www.theperlreview.com/community_calendar 310http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL 311http://groups.google.com/ 312http://groups.google.com/group/comp.lang.perl.misc/topics 313http://groups.google.co.uk/group/perl.perl5.porters/browse_thread/thread/b4e2b1d88280ddff/630b667a66e3509f?#630b667a66e3509f 314http://groups.google.com/group/comp.sys.sgi.admin/msg/3ad8353bc4ce3cb0 315http://groups.google.com/group/perl.module.build/browse_thread/thread/c8065052b2e0d741 316http://groups.google.com/group/perl.perl5.porters/browse_thread/thread/adaffaaf939b042e/20dafc298df737f0%2320dafc298df737f0?sa=X&oi=groupsr&start=0&num=3 317 318# broken webserver that doesn't like HEAD requests 319http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view 320 321# these have been reported upstream to CPAN authors 322http://www.gnu.org/manual/tar/html_node/tar_139.html 323http://www.w3.org/pub/WWW/TR/Wd-css-1.html 324http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP 325http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp 326http://search.cpan.org/search?query=Module::Build::Convert 327http://www.refcnt.org/papers/module-build-convert 328http://csrc.nist.gov/cryptval/shs.html 329http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp 330http://www.debian.or.jp/~kubota/unicode-symbols.html.en 331http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html 332http://www.debian.or.jp/~kubota/unicode-symbols.html.en 333http://rfc.net/rfc2781.html 334http://www.icu-project.org/charset/ 335http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html 336http://www.rfc-editor.org/ 337http://www.rfc.net/ 338http://www.oreilly.com/people/authors/lunde/cjk_inf.html 339http://www.oreilly.com/catalog/cjkvinfo/ 340http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz 341http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz 342http://www.egt.ie/standards/iso3166/iso3166-1-en.html 343http://www.bsi-global.com/iso4217currency 344http://www.plover.com/~mjd/perl/Memoize/ 345http://www.plover.com/~mjd/perl/MiniMemoize/ 346http://www.sysadminmag.com/tpj/issues/vol5_5/ 347ftp://ftp.tpc.int/tpc/server/UNIX/ 348http://www.nara.gov/genealogy/ 349http://home.utah-inter.net/kinsearch/Soundex.html 350http://www.nara.gov/genealogy/soundex/soundex.html 351http://rfc.net/rfc3461.html 352ftp://ftp.cs.pdx.edu/pub/elvis/ 353http://www.fh-wedel.de/elvis/ 354 355__END__ 356 357=head1 NAME 358 359checkURL.pl - Check that all the URLs in the Perl source are valid 360 361=head1 DESCRIPTION 362 363This program checks that all the URLs in the Perl source are valid. It 364checks HTTP and FTP links in parallel and contains a list of known 365bad example links in its source. It takes 4 minutes to run on my 366machine. The results are written to 'uris.txt' and list the filename, 367the URL and the error: 368 369 * ext/Locale-Maketext/lib/Locale/Maketext.pod 370 http://sunsite.dk/RFC/rfc/rfc2277.html 371 404 Not Found 372 ... 373 374It should be run every so often and links fixed and upstream authors 375notified. 376 377Note that the web is unstable and some websites are temporarily down. 378