xref: /openbsd-src/gnu/usr.bin/perl/Porting/checkURL.pl (revision f1dd7b858388b4a23f4f67a4957ec5ff656ebbe8)
1#!perl
2use strict;
3use warnings;
4use autodie;
5use feature qw(say);
6require File::Find::Rule;
7require File::Slurp;
8require File::Spec;
9require IO::Socket::SSL;
10use List::Util qw(sum);
11require LWP::UserAgent;
12require Net::FTP;
13require Parallel::Fork::BossWorkerAsync;
14require Term::ProgressBar::Simple;
15require URI::Find::Simple;
16$| = 1;
17
18my %ignore;
19while ( my $line = <main::DATA> ) {
20    chomp $line;
21    next if $line =~ /^#/;
22    next unless $line;
23    $ignore{$line} = 1;
24}
25
26my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
27$ua->timeout(58);
28$ua->env_proxy;
29
30my @filenames = @ARGV;
31@filenames = sort grep { $_ !~ /^\.git/ } File::Find::Rule->new->file->in('.')
32    unless @filenames;
33
34my $total_bytes = sum map {-s} @filenames;
35
36my $extract_progress = Term::ProgressBar::Simple->new(
37    {   count => $total_bytes,
38        name  => 'Extracting URIs',
39    }
40);
41
42my %uris;
43foreach my $filename (@filenames) {
44    next if $filename =~ /uris\.txt/;
45    next if $filename =~ /check_uris/;
46    next if $filename =~ /\.patch$/;
47    next if $filename =~ 'cpan/Pod-Simple/t/perlfaqo?\.pod';
48    next if $filename =~ /checkURL\.pl$/;
49    my $contents = File::Slurp::read_file($filename);
50    my @uris     = URI::Find::Simple::list_uris($contents);
51    foreach my $uri (@uris) {
52        next unless $uri =~ /^(http|ftp)/;
53        next if $ignore{$uri};
54
55        # no need to hit rt.perl.org
56        next
57            if $uri =~ m{^https?://rt.perl.org/(?:rt3/)?Ticket/Display.html?id=\d+$};
58
59        # no need to hit github
60        next
61            if $uri =~ m{^https?://(?:www\.)?github\.com/[pP]erl/perl5/issues/\d+$};
62
63        # no need to hit rt.cpan.org
64        next
65            if $uri =~ m{^https?://rt.cpan.org/Public/Bug/Display.html?id=\d+$};
66
67        # no need to hit google groups (weird redirect LWP does not like)
68        next
69            if $uri =~ m{^http://groups\.google\.com/};
70
71        push @{ $uris{$uri} }, $filename;
72    }
73    $extract_progress += -s $filename;
74}
75
76my $bw = Parallel::Fork::BossWorkerAsync->new(
77    work_handler   => \&work_alarmed,
78    global_timeout => 120,
79    worker_count   => 20,
80);
81
82foreach my $uri ( keys %uris ) {
83    my @filenames = @{ $uris{$uri} };
84    $bw->add_work( { uri => $uri, filenames => \@filenames } );
85}
86
87undef $extract_progress;
88
89my $fetch_progress = Term::ProgressBar::Simple->new(
90    {   count => scalar( keys %uris ),
91        name  => 'Fetching URIs',
92    }
93);
94
95my %filenames;
96while ( $bw->pending() ) {
97    my $response   = $bw->get_result();
98    my $uri        = $response->{uri};
99    my @filenames  = @{ $response->{filenames} };
100    my $is_success = $response->{is_success};
101    my $message    = $response->{message};
102
103    unless ($is_success) {
104        foreach my $filename (@filenames) {
105            push @{ $filenames{$filename} },
106                { uri => $uri, message => $message };
107        }
108    }
109    $fetch_progress++;
110}
111$bw->shut_down();
112
113my $fh = IO::File->new('> uris.txt');
114foreach my $filename ( sort keys %filenames ) {
115    $fh->say("* $filename");
116    my @bits = @{ $filenames{$filename} };
117    foreach my $bit (@bits) {
118        my $uri     = $bit->{uri};
119        my $message = $bit->{message};
120        $fh->say("  $uri");
121        $fh->say("    $message");
122    }
123}
124$fh->close;
125
126say 'Finished, see uris.txt';
127
128sub work_alarmed {
129    my $conf = shift;
130    eval {
131        local $SIG{ALRM} = sub { die "alarm\n" };    # NB: \n required
132        alarm 60;
133        $conf = work($conf);
134        alarm 0;
135    };
136    if ($@) {
137        $conf->{is_success} = 0;
138        $conf->{message}    = 'Timed out';
139
140    }
141    return $conf;
142}
143
144sub work {
145    my $conf      = shift;
146    my $uri       = $conf->{uri};
147    my @filenames = @{ $conf->{filenames} };
148
149    if ( $uri =~ /^http/ ) {
150        my $uri_without_fragment = URI->new($uri);
151        my $fragment             = $uri_without_fragment->fragment(undef);
152        my $response             = $ua->head($uri_without_fragment);
153
154        $conf->{is_success} = $response->is_success;
155        $conf->{message}    = $response->status_line;
156        return $conf;
157    } else {
158
159        my $uri_object = URI->new($uri);
160        my $host       = $uri_object->host;
161        my $path       = $uri_object->path;
162        my ( $volume, $directories, $filename )
163            = File::Spec->splitpath($path);
164
165        my $ftp = Net::FTP->new( $host, Passive => 1, Timeout => 60 );
166        unless ($ftp) {
167            $conf->{is_succcess} = 0;
168            $conf->{message}     = "Can not connect to $host: $@";
169            return $conf;
170        }
171
172        my $can_login = $ftp->login( "anonymous", '-anonymous@' );
173        unless ($can_login) {
174            $conf->{is_success} = 0;
175            $conf->{message} = "Can not login ", $ftp->message;
176            return $conf;
177        }
178
179        my $can_binary = $ftp->binary();
180        unless ($can_binary) {
181            $conf->{is_success} = 0;
182            $conf->{message} = "Can not binary ", $ftp->message;
183            return $conf;
184        }
185
186        my $can_cwd = $ftp->cwd($directories);
187        unless ($can_cwd) {
188            $conf->{is_success} = 0;
189            $conf->{message} = "Can not cwd to $directories ", $ftp->message;
190            return $conf;
191        }
192
193        if ($filename) {
194            my $can_size = $ftp->size($filename);
195            unless ($can_size) {
196                $conf->{is_success} = 0;
197                $conf->{message}
198                    = "Can not size $filename in $directories",
199                    $ftp->message;
200                return $conf;
201            }
202        } else {
203            my ($can_dir) = $ftp->dir;
204            unless ($can_dir) {
205                my ($can_ls) = $ftp->ls;
206                unless ($can_ls) {
207                    $conf->{is_success} = 0;
208                    $conf->{message}
209                        = "Can not dir or ls in $directories ",
210                        $ftp->message;
211                    return $conf;
212                }
213            }
214        }
215
216        $conf->{is_success} = 1;
217        return $conf;
218    }
219}
220
221__DATA__
222# these are fine but give errors
223ftp://ftp.stratus.com/pub/vos/posix/ga/ga.html
224ftp://ftp.stratus.com/pub/vos/utility/utility.html
225
226# these are missing, sigh
227ftp://ftp.sco.com/SLS/ptf7051e.Z
228http://perlmonks.thepen.com/42898.html
229http://svn.mutatus.co.uk/wsvn/Scalar-List-Utils/trunk/
230http://public.activestate.com/cgi-bin/perlbrowse
231http://svn.mutatus.co.uk/browse/libnet/tags/libnet-1.17/ChangeLog
232http://aspn.activestate.com/ASPN/Mail/Message/perl6-internals/2746631
233http://my.smithmicro.com/mac/stuffit/
234http://www.wg.omron.co.jp/cgi-bin/j-e/jfriedl.html
235http://persephone.cps.unizar.es/general/gente/spd/gzip/gzip.html
236http://www.openzaurus.org/
237http://Casbah.org/
238http://download.microsoft.com/download/vc15/Patch/1.52/W95/EN-US/nmake15.exe
239http://www.pvhp.com/~pvhp/
240http://www.pvhp.com/%7Epvhp/
241http://www.pvhp.com/%7epvhp/
242http://www.leo.org
243http://www.madgoat.com
244http://www.mks.com/s390/gnu/
245http://www.research.att.com/sw/tools/uwin/
246http://www.tpj.com/
247http://safaribooksonline.com/
248http://use.perl.org/~autrijus/journal/25768
249http://www.s390.ibm.com/products/oe/bpxqp1.html
250http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01396.html
251http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01489.html
252http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01491.html
253http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg01608.html
254http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02144.html
255http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1998-02/msg02998.html
256http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/1999-03/msg00520.html
257http://www.w3.org/Security/Faq/
258
259# these are URI extraction bugs
260http://www.perl.org/E
261http://en.wikipedia.org/wiki/SREC_(file_format
262http://somewhere.else',-type=/
263ftp:passive-mode
264ftp:
265http:[-
266http://search.cpan.org/src/KWILLIAMS/Module-Build-0.30/lib/Module/Build/Platform/Windows.pm:split_like_shell
267http://www.xray.mpe.mpg.de/mailing-lists/perl5-
268http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP:
269http://perl.come/
270http://www.perl.come/
271
272# these are used as an example
273http://example.com/
274http://something.here/
275http://users.perl5.git.perl.org/~yourlogin/
276http://github.com/USERNAME/perl/tree/orange
277http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
278http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
279http://somewhere.else$/
280http://somewhere.else$/
281http://somewhere.else/bin/foo&bar',-Type=
282http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi
283http://the.good.ship.lollypop.com:8080/cgi-bin/foo.cgi/somewhere/else?game=chess;game=checkers;weather=dull;foo=bar
284http://www.perl.org/test.cgi
285http://cpan2.local/
286http://search.cpan.org/perldoc?
287http://cpan1.local/
288http://cpan.dev.local/CPAN
289http:///
290ftp://
291ftp://myurl/
292ftp://ftp.software.ibm.com/aix/fixes/v4/other/vac.C.5.0.2.6.bff
293http://www14.software.ibm.com/webapp/download/downloadaz.jsp
294http://www.cpan.org/modules/by-module/Archive/Archive-Zip-*.tar.gz
295http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-*.TXT
296http://localhost/tmp/index.txt
297http://example.com/foo/bar.html
298http://example.com/Text-Bastardize-1.06.tar.gz
299ftp://example.com/sources/packages.txt
300http://example.com/sources/packages.txt
301http://example.com/sources
302ftp://example.com/sources
303http://some.where.com/dir/file.txt
304http://some.where.com/dir/a.txt
305http://foo.com/X.tgz
306ftp://foo.com/X.tgz
307http://foo/
308http://www.foo.com:8000/
309http://mysite.com/cgi-bin/log.cgi/http://www.some.other.site/args
310http://decoded/mirror/path
311http://a/b/c/d/e/f/g/h/i/j
312http://foo/bar.gz
313ftp://ftp.perl.org
314http://purl.org/rss/1.0/modules/taxonomy/
315ftp://ftp.sun.ac.za/CPAN/CPAN/
316ftp://ftp.cpan.org/pub/mirror/index.txt
317ftp://cpan.org/pub/mirror/index.txt
318http://example.com/~eh/
319http://plagger.org/.../rss
320http://umn.dl.sourceforge.net/mingw/gcc-g++-3.4.5-20060117-1.tar.gz
321http://umn.dl.sourceforge.net/mingw/binutils-2.16.91-20060119-1.tar.gz
322http://umn.dl.sourceforge.net/mingw/mingw-runtime-3.10.tar.gz
323http://umn.dl.sourceforge.net/mingw/gcc-core-3.4.5-20060117-1.tar.gz
324http://umn.dl.sourceforge.net/mingw/w32api-3.6.tar.gz
325http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
326http://module-build.sourceforge.net/META-spec-new.html
327http://module-build.sourceforge.net/META-spec-v1.4.html
328http://www.cs.vu.nl/~tmgil/vi.html
329http://perlcomposer.sourceforge.net/vperl.html
330http://www.perl.com/CPAN/authors/Tom_Christiansen/scripts/rep
331http://w4.lns.cornell.edu/~pvhp/ptk/ptkTOC.html
332http://world.std.com/~aep/ptkdb/
333http://www.castlelink.co.uk/object_system/
334http://www.fh-wedel.de/elvis/
335ftp://ftp.blarg.net/users/amol/zsh/
336ftp://ftp.funet.fi/pub/languages/perl/CPAN
337http://search.cpan.org/CPAN/authors/id/S/SH/SHAY/dmake-4.5-20060619-SHAY.zip
338http://users.perl5.git.perl.org/~USERNAME
339http://foo/x//y/script.cgi/a//b
340http://xxx/script.cgi/http://foo
341http://foo/./x//z/script.cgi/a/../b//c
342http://somewhere.else/in/movie/land
343http://somewhere.else/finished.html
344http://somewhere.else/bin/foo&bar$
345http://somewhere.else/
346http://proxy:8484/
347http://proxy/
348http://myrepo.example.com/
349http://remote/source
350https://example.com/
351http://example.com:1024/
352http:///path?foo=bar
353http://[::]:1024/
354http://([/
355http://example.com:9000/index.html
356http://proxy.example.com:8080/
357http:///index.html
358http://[www.json::pp.org]/
359http://localhost/
360http://foo.example.com/
361http://abc.com/a.js
362http://whatever/man/1/crontab
363http://abc.com/c.js
364http://whatever/Foo%3A%3ABar
365http://abc.com/b.js
366http://remote.server.com/jquery.css
367http://some.other.com/page.html
368https://text.com/1/2
369https://text.com/1/2
370http://link.included.here?o=1&p=2
371http://link.included.here?o=1&amp;p=2
372http://link.included.here?o=1&amp;p=2
373http://link.included.here/
374http://foo/x//y/script.cgi/a//b
375http://xxx/script.cgi/http://foo
376http://foo/./x//z/script.cgi/a/../b//c
377http://somewhere.else/in/movie/land
378http://somewhere.else/finished.html
379http://webproxy:3128/
380http://www/
381
382# these are used to generate or match URLs
383http://www.cpan.org/modules/by-authors/id/$1/$1$2/$dist
384http://www.cpantesters.org/show/%s.yaml
385ftp://(.*?)/(.*)/(.*
386ftp://(.*?)/(.*)/(.*
387ftp://(.*?)/(.*)/(.*
388ftp://ftp.foo.bar/
389http://$host/
390http://wwwe%3C46/
391ftp:/
392http://$addr/mark?commit=$
393http://search.cpan.org/~
394http:/
395ftp:%5Cn$url
396http://www.ietf.org/rfc/rfc$2.txt
397http://search.cpan.org/~
398ftp:%5Cn$url
399
400# weird redirects that LWP doesn't like
401http://www.theperlreview.com/community_calendar
402http://www.software.hp.com/portal/swdepot/displayProductInfo.do?productNumber=PERL
403http://sunsolve.sun.com
404
405# broken webserver that doesn't like HEAD requests
406http://docs.sun.com/app/docs/doc/816-5168/syslog-3c?a=view
407http://www.w3.org/TR/html4/loose.dtd
408
409# these have been reported upstream to CPAN authors
410http://www.gnu.org/manual/tar/html_node/tar_139.html
411http://www.w3.org/pub/WWW/TR/Wd-css-1.html
412http://support.microsoft.com/support/kb/articles/Q280/3/41.ASP
413http://msdn.microsoft.com/workshop/author/dhtml/httponly_cookies.asp
414http://search.cpan.org/search?query=Module::Build::Convert
415http://www.refcnt.org/papers/module-build-convert
416http://csrc.nist.gov/cryptval/shs.html
417http://msdn.microsoft.com/workshop/author/dhtml/reference/charsets/charset4.asp
418http://www.debian.or.jp/~kubota/unicode-symbols.html.en
419http://www.mail-archive.com/perl5-porters@perl.org/msg69766.html
420http://www.debian.or.jp/~kubota/unicode-symbols.html.en
421http://rfc.net/rfc2781.html
422http://www.icu-project.org/charset/
423http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html
424http://www.rfc-editor.org/
425http://www.rfc.net/
426http://www.oreilly.com/people/authors/lunde/cjk_inf.html
427http://www.oreilly.com/catalog/cjkvinfo/
428http://www.cpan.org/modules/by-author/Damian_Conway/Filter-Simple.tar.gz
429http://www.csse.monash.edu.au/~damian/CPAN/Filter-Simple.tar.gz
430http://www.egt.ie/standards/iso3166/iso3166-1-en.html
431http://www.bsi-global.com/iso4217currency
432http://www.plover.com/~mjd/perl/Memoize/
433http://www.plover.com/~mjd/perl/MiniMemoize/
434http://www.sysadminmag.com/tpj/issues/vol5_5/
435ftp://ftp.tpc.int/tpc/server/UNIX/
436http://www.nara.gov/genealogy/
437http://home.utah-inter.net/kinsearch/Soundex.html
438http://www.nara.gov/genealogy/soundex/soundex.html
439http://rfc.net/rfc3461.html
440ftp://ftp.cs.pdx.edu/pub/elvis/
441http://www.fh-wedel.de/elvis/
442http://lists.perl.org/list/perl-mvs.html
443http://www.cpan.org/ports/os2/
444http://github.com/dagolden/cpan-meta-spec
445http://github.com/dagolden/cpan-meta-spec/issues
446http://www.opensource.org/licenses/lgpl-license.phpt
447http://reality.sgi.com/ariel
448http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
449http://www.chiark.greenend.org.uk/pipermail/ukcrypto/1999-February/003538.html
450http://www.nsrl.nist.gov/testdata/
451http://public.activestate.com/cgi-bin/perlbrowse/p/31194
452http://public.activestate.com/cgi-bin/perlbrowse?patch=16173
453http://public.activestate.com/cgi-bin/perlbrowse?patch=16049
454http://www.li18nux.org/docs/html/CodesetAliasTable-V10.html
455http://aspn.activestate.com/ASPN/Mail/Message/perl5-porters/3486118
456http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.ut
457http://lxr.mozilla.org/seamonkey/source/intl/uconv/ucvlatin/vps.uf
458http://github.com/schwern/extutils-makemaker
459https://github.com/dagolden/cpanpm/compare/master...private%2Fuse-http-lite
460http://www.json.org/JSON::PP_checker/
461ftp://ftp.kiae.su/pub/unix/fido/
462http://www.gallistel.net/nparker/weather/code/
463http://www.javaworld.com/javaworld/jw-09-2001/jw-0928-ntmessages.html
464ftp://ftp-usa.tpc.int/pub/tpc/server/UNIX/
465http://www.cis.ohio-state.edu/htbin/rfc/rfc959.html
466http://public.activestate.com/cgi-bin/perlbrowse/p/33567
467http://public.activestate.com/cgi-bin/perlbrowse/p/33566
468http://www.dsmit.com/cons/
469http://www.makemaker.org/wiki/index.cgi?ModuleBuildConversionGuide
470
471__END__
472
473=head1 NAME
474
475checkURL.pl - Check that all the URLs in the Perl source are valid
476
477=head1 DESCRIPTION
478
479This program checks that all the URLs in the Perl source are valid. It
480checks HTTP and FTP links in parallel and contains a list of known
481bad example links in its source. It takes 4 minutes to run on my
482machine. The results are written to 'uris.txt' and list the filename,
483the URL and the error:
484
485  * ext/Locale-Maketext/lib/Locale/Maketext.pod
486    http://sunsite.dk/RFC/rfc/rfc2277.html
487      404 Not Found
488  ...
489
490It should be run every so often and links fixed and upstream authors
491notified.
492
493Note that the web is unstable and some websites are temporarily down.
494