xref: /openbsd-src/gnu/usr.bin/perl/Porting/checkURL.pl (revision d13be5d47e4149db2549a9828e244d59dbc43f15)
1#!perl
2use strict;
3use warnings;
4use autodie;
5use feature qw(say);
6use File::Find::Rule;
7use File::Slurp;
8use File::Spec;
9use IO::Socket::SSL;
10use List::Util qw(sum);
11use LWP::UserAgent;
12use Net::FTP;
13use Parallel::Fork::BossWorkerAsync;
14use Term::ProgressBar::Simple;
15use URI::Find::Simple qw( list_uris );
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20    chomp $line;
21    next if $line =~ /^#/;
22    next unless $line;
23    $ignore{$line} = 1;
24}
25
26my $ua = LWP::UserAgent->new;
27$ua->timeout(58);
28$ua->env_proxy;
29
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32    unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37    {   count => $total_bytes,
38        name  => 'Extracting URIs',
39    }
40);
41
42my %uris;
43foreach my $filename (@filenames) {
44    next if $filename =~ /uris\.txt/;
45    next if $filename =~ /check_uris/;
46    next if $filename =~ /\.patch$/;
47    my $contents = read_file($filename);
48    my @uris     = list_uris($contents);
49    foreach my $uri (@uris) {
50        next unless $uri =~ /^(http|ftp)/;
51        next if $ignore{$uri};
52
53        # no need to hit rt.perl.org
54        next
55            if $uri =~ m{^http://rt.perl.org/rt3/Ticket/Display.html?id=\d+$};
56
57        # no need to hit rt.cpan.org
58        next
59            if $uri =~ m{^http://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
60        push @{ $uris{$uri} }, $filename;
61    }
62    $extract_progress += -s $filename;
63}
64
65my $bw = Parallel::Fork::BossWorkerAsync->new(
66    work_handler   => \&work_alarmed,
67    global_timeout => 120,
68    worker_count   => 20,
69);
70
71foreach my $uri ( keys %uris ) {
72    my @filenames = @{ $uris{$uri} };
73    $bw->add_work( { uri => $uri, filenames => \@filenames } );
74}
75
76undef $extract_progress;
77
78my $fetch_progress = Term::ProgressBar::Simple->new(
79    {   count => scalar( keys %uris ),
80        name  => 'Fetching URIs',
81    }
82);
83
84my %filenames;
85while ( $bw->pending() ) {
86    my $response   = $bw->get_result();
87    my $uri        = $response->{uri};
88    my @filenames  = @{ $response->{filenames} };
89    my $is_success = $response->{is_success};
90    my $message    = $response->{message};
91
92    unless ($is_success) {
93        foreach my $filename (@filenames) {
94            push @{ $filenames{$filename} },
95                { uri => $uri, message => $message };
96        }
97    }
98    $fetch_progress++;
99}
100$bw->shut_down();
101
102my $fh = IO::File->new('> uris.txt');
103foreach my $filename ( sort keys %filenames ) {
104    $fh->say("* $filename");
105    my @bits = @{ $filenames{$filename} };
106    foreach my $bit (@bits) {
107        my $uri     = $bit->{uri};
108        my $message = $bit->{message};
109        $fh->say("  $uri");
110        $fh->say("    $message");
111    }
112}
113$fh->close;
114
115say 'Finished, see uris.txt';
116
117sub work_alarmed {
118    my $conf = shift;
119    eval {
120        local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
121        alarm 60;
122        $conf = work($conf);
123        alarm 0;
124    };
125    if ($@) {
126        $conf->{is_success} = 0;
127        $conf->{message}    = 'Timed out';
128
129    }
130    return $conf;
131}
132
133sub work {
134    my $conf      = shift;
135    my $uri       = $conf->{uri};
136    my @filenames = @{ $conf->{filenames} };
137
138    if ( $uri =~ /^http/ ) {
139        my $uri_without_fragment = URI->new($uri);
140        my $fragment             = $uri_without_fragment->fragment(undef);
141        my $response             = $ua->head($uri_without_fragment);
142
143        $conf->{is_success} = $response->is_success;
144        $conf->{message}    = $response->status_line;
145        return $conf;
146    } else {
147
148        my $uri_object = URI->new($uri);
149        my $host       = $uri_object->host;
150        my $path       = $uri_object->path;
151        my ( $volume, $directories, $filename )
152            = File::Spec->splitpath($path);
153
154        my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
155        unless ($ftp) {
156            $conf->{is_succcess} = 0;
157            $conf->{message}     = "Can not connect to $host: $@";
158            return $conf;
159        }
160
161        my $can_login = $ftp->login( "anonymous", '-anonymous@' );
162        unless ($can_login) {
163            $conf->{is_success} = 0;
164            $conf->{message} = "Can not login ", $ftp->message;
165            return $conf;
166        }
167
168        my $can_binary = $ftp->binary();
169        unless ($can_binary) {
170            $conf->{is_success} = 0;
171            $conf->{message} = "Can not binary ", $ftp->message;
172            return $conf;
173        }
174
175        my $can_cwd = $ftp->cwd($directories);
176        unless ($can_cwd) {
177            $conf->{is_success} = 0;
178            $conf->{message} = "Can not cwd to $directories ", $ftp->message;
179            return $conf;
180        }
181
182        if ($filename) {
183            my $can_size = $ftp->size($filename);
184            unless ($can_size) {
185                $conf->{is_success} = 0;
186                $conf->{message}
187                    = "Can not size $filename in $directories",
188                    $ftp->message;
189                return $conf;
190            }
191        } else {
192            my ($can_dir) = $ftp->dir;
193            unless ($can_dir) {
194                my ($can_ls) = $ftp->ls;
195                unless ($can_ls) {
196                    $conf->{is_success} = 0;
197                    $conf->{message}
198                        = "Can not dir or ls in $directories ",
199                        $ftp->message;
200                    return $conf;
201                }
202            }
203        }
204
205        $conf->{is_success} = 1;
206        return $conf;
207    }
208}
209
210__DATA__
211# these are fine but give errors
212ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
213ftp://ftp.stratus.com/pub/vos/utility/utility.html
214
215# this is missing, sigh
216ftp://ftp.sco.com/SLS/ptf7051e.Z
217http://perlmonks.thepen.com/42898.html
218
219# this are URI extraction bugs
220http://www.perl.org/E
221http://en.wikipedia.org/wiki/SREC_(file_format
222http://somewhere.else',-type=/
223ftp:passive-mode
224ftp:
225http:[-
226http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
227http://www.xray.mpe.mpg.de/mailing-lists/perl5-
228http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
229
230# these are used as an example
231http://example.com/
232http://something.here/
233http://users.perl5.git.perl.org/~yourlogin/
234http://github.com/USERNAME/perl/tree/orange
235http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
236http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
237http://somewhere.else$/
238http://somewhere.else$/
239http://somewhere.else/bin/foo&bar',-Type=
240http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
241http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
242http://www.perl.org/test.cgi
243http://cpan2.local/
244http://search.cpan.org/perldoc?
245http://cpan1.local/
246http://cpan.dev.local/CPAN
247http:///
248ftp://
249ftp://myurl/
250ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
251http://www14.software.ibm.com/webapp/download/downloadaz.jsp
252http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
253http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
254http://localhost/tmp/index.txt
255http://example.com/foo/bar.html
256http://example.com/Text-Bastardize-1.06.tar.gz
257ftp://example.com/sources/packages.txt
258http://example.com/sources/packages.txt
259http://example.com/sources
260ftp://example.com/sources
261http://some.where.com/dir/file.txt
262http://some.where.com/dir/a.txt
263http://foo.com/X.tgz
264ftp://foo.com/X.tgz
265http://foo/
266http://www.foo.com:8000/
267http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
268http://decoded/mirror/path
269http://a/b/c/d/e/f/g/h/i/j
270http://foo/bar.gz
271ftp://ftp.perl.org
272http://purl.org/rss/1.0/modules/taxonomy/
273ftp://ftp.sun.ac.za/CPAN/CPAN/
274ftp://ftp.cpan.org/pub/mirror/index.txt
275ftp://cpan.org/pub/mirror/index.txt
276http://example.com/~eh/
277http://plagger.org/.../rss
278http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
279http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
280http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
281http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
282http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
283http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
284http://module-build.sourceforge.net/META-spec-new.html
285http://module-build.sourceforge.net/META-spec-v1.4.html
286http://www.cs.vu.nl/~tmgil/vi.html
287http://perlcomposer.sourceforge.net/vperl.html
288http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
289http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
290http://world.std.com/~aep/ptkdb/
291http://www.castlelink.co.uk/object_system/
292http://www.fh-wedel.de/elvis/
293ftp://ftp.blarg.net/users/amol/zsh/
294ftp://ftp.funet.fi/pub/languages/perl/CPAN
295http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
296
297# these are used to generate or match URLs
298http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
299http://www.cpantesters.org/show/%s.yaml
300ftp://(.*?)/(.*)/(.*
301ftp://(.*?)/(.*)/(.*
302ftp://(.*?)/(.*)/(.*
303ftp://ftp.foo.bar/
304http://$host/
305http://wwwe%3C46/
306ftp:/
307
308# weird redirects that LWP doesn't like
309http://www.theperlreview.com/community_calendar
310http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
311http://groups.google.com/
312http://groups.google.com/group/comp.lang.perl.misc/topics
313http://groups.google.co.uk/group/perl.perl5.porters/browse_thread/thread/b4e2b1d88280ddff/630b667a66e3509f?#630b667a66e3509f
314http://groups.google.com/group/comp.sys.sgi.admin/msg/3ad8353bc4ce3cb0
315http://groups.google.com/group/perl.module.build/browse_thread/thread/c8065052b2e0d741
316http://groups.google.com/group/perl.perl5.porters/browse_thread/thread/adaffaaf939b042e/20dafc298df737f0%2320dafc298df737f0?sa=X&oi=groupsr&start=0&num=3
317
318# broken webserver that doesn't like HEAD requests
319http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
320
321# these have been reported upstream to CPAN authors
322http://www.gnu.org/manual/tar/html_node/tar_139.html
323http://www.w3.org/pub/WWW/TR/Wd-css-1.html
324http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
325http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
326http://search.cpan.org/search?query=Module::Build::Convert
327http://www.refcnt.org/papers/module-build-convert
328http://csrc.nist.gov/cryptval/shs.html
329http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
330http://www.debian.or.jp/~kubota/unicode-symbols.html.en
331http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
332http://www.debian.or.jp/~kubota/unicode-symbols.html.en
333http://rfc.net/rfc2781.html
334http://www.icu-project.org/charset/
335http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
336http://www.rfc-editor.org/
337http://www.rfc.net/
338http://www.oreilly.com/people/authors/lunde/cjk_inf.html
339http://www.oreilly.com/catalog/cjkvinfo/
340http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
341http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
342http://www.egt.ie/standards/iso3166/iso3166-1-en.html
343http://www.bsi-global.com/iso4217currency
344http://www.plover.com/~mjd/perl/Memoize/
345http://www.plover.com/~mjd/perl/MiniMemoize/
346http://www.sysadminmag.com/tpj/issues/vol5_5/
347ftp://ftp.tpc.int/tpc/server/UNIX/
348http://www.nara.gov/genealogy/
349http://home.utah-inter.net/kinsearch/Soundex.html
350http://www.nara.gov/genealogy/soundex/soundex.html
351http://rfc.net/rfc3461.html
352ftp://ftp.cs.pdx.edu/pub/elvis/
353http://www.fh-wedel.de/elvis/
354
355__END__
356
357=head1 NAME
358
359checkURL.pl - Check that all the URLs in the Perl source are valid
360
361=head1 DESCRIPTION
362
363This program checks that all the URLs in the Perl source are valid. It
364checks HTTP and FTP links in parallel and contains a list of known
365bad example links in its source. It takes 4 minutes to run on my
366machine. The results are written to 'uris.txt' and list the filename,
367the URL and the error:
368
369  * ext/Locale-Maketext/lib/Locale/Maketext.pod
370    http://sunsite.dk/RFC/rfc/rfc2277.html
371      404 Not Found
372  ...
373
374It should be run every so often and links fixed and upstream authors
375notified.
376
377Note that the web is unstable and some websites are temporarily down.
378