Listing 1
#!/usr/bin/perl -w
# checklinks -- Check Hypertext Links on a Web Page
# Usage: See POD below
#------------------------------------
# Copyright (C) 1996 Jim Weirich. All rights reserved.
# Permission is granted for free use or modification.
#------------------------------------
use HTML::LinkExtor;
use HTTP::Request;
use LWP::UserAgent;
use LWP::Simple;
use URI::URL;
use Getopt::Std;
$version = '1.0';
# Usage
#-------------------------------------
# Display the usage message by scanning the POD documentation for
# the usage statement.
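# (The first matching line is the POD SYNOPSIS line after __END__;
#  it is printed with the B<> and I<> markup stripped.)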
sub Usage {
    while (<DATA>) {
        if (/^B<[-A-Za-z0-9_.]+>/) {
            s/[BI]<([^>]*)>/$1/g;
            print "Usage: $_";
            last;
        }
    }
    exit 0;
}
# ManPage
#------------------------------------
# Display the man page by invoking the pod2man (or pod2text) script
# on self.
sub ManPage {
    my($pager) = 'more';
    $pager = $ENV{'PAGER'} if $ENV{'PAGER'};
    if ($ENV{'TERM'} =~ /^(dumb|emacs)$/) {
        system ("pod2text $0");
    } else {
        system ("pod2man $0 | nroff -man | $pager");
    }
    exit 0;
}
# HandleParsedLink
#---------------------------------
# HandleParsedLink is a callback provided for handling HTML links
# found during parsing. $tag is the HTML tag where the link was
# found. %links is a hash that contains the keyword/value pairs from
# the link that contain URLs. For example, if an HTML anchor was
# found, $tag would be "a" and %links would be (href=>"url").
# We check each URL in %links. We make sure the URL is absolute
# rather than relative. URLs that don't begin with "http:" or "file:"
# are ignored. Bookmarks following a "#" character are removed.
# If we have not seen this URL yet, we add it to the list of URLs to
# be checked. Finally, we note where the URL was found in its list
# of references.
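# For illustration (hypothetical tag and file name): given the tag
#     <img src="pic.gif">
# HTML::LinkExtor would invoke this callback roughly as
#     HandleParsedLink("img", src => "pic.gif");
# and the relative "pic.gif" would be made absolute against
# $currentUrl before being queued for checking.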
sub HandleParsedLink {
    my ($tag, %links) = @_;
    for $url (values %links) {
        my $urlobj = new URI::URL $url, $currentUrl;
        $url = $urlobj->abs;
        next if $url !~ /^(http|file):/;
        $url =~ s/#.*$//;
        if (!$refs{$url}) {
            $refs{$url} = [];
            push (@tobeChecked, $url);
        }
        push (@{$refs{$url}}, $currentUrl);
    }
    1;
}
# HandleDocChunk
#--------------------------------
# HandleDocChunk is called by the UserAgent as the web document is
# fetched. As each chunk of the document is retrieved, it is passed
# to the HTML parser object for further processing (which in this
# case means extracting the links).
sub HandleDocChunk {
    my ($data, $response, $protocol) = @_;
    $parser->parse ($data);
}
# ScanUrl
# ------------------------------
# We have a URL that needs to be scanned for further references to
# other URLs. We create a request to fetch the document and give
# that request to the UserAgent responsible for doing the fetch.
sub ScanUrl {
    my($url) = @_;
    $currentUrl = $url;
    push (@isScanned, $url);
    print "Scanning $url\n";
    $request = new HTTP::Request (GET => $url);
    $response = $agent->request ($request, \&HandleDocChunk);
    if ($response->is_error) {
        die "Can't Fetch URL $url\n";
    }
    $parser->eof;
}
# CheckUrl
# ------------------------------
# We have a URL that needs to be checked and validated. We attempt
# to get the header of the document using the head() function. If
# this fails, we add the URL to our list of bad URLs. If we do get
# the header, the URL is added to our good URL list. If the good URL
# is part of our local web site (i.e. it begins with the local
# prefix), then we want to scan this URL for more references.
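# For reference (standard LWP::Simple behavior): in scalar context
# head($url) returns true on success, while in list context it
# returns the five values
#     ($content_type, $document_length, $modified_time, $expires, $server).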
sub CheckUrl {
    my($url) = @_;
    print " Checking $url\n" if $verbose;
    if (!head ($url)) {
        push (@badUrls, $url);
    } else {
        push (@goodUrls, $url);
        if ($doRecurse && $url =~ /\.html?/
            && $url =~ /^$localprefix/) {
            push (@tobeScanned, $url);
        }
    }
}
# Main Program
#---------------------------------
use vars qw($opt_h $opt_H $opt_p $opt_r $opt_u $opt_v $opt_V);
getopts('hHp:ruvV') || die "Command aborted.\n";
$verbose = ($opt_v ? $opt_v : 0);
$printUrls = ($opt_u ? $opt_u : 0);
$doRecurse = ($opt_r ? $opt_r : 0);
die "Version $version\n" if $opt_V;
ManPage() if $opt_H;
Usage() if $opt_h || @ARGV==0;
# Initialize our bookkeeping arrays
@tobeScanned = ();      # list of URLs to be scanned
@tobeChecked = ();      # list of URLs to be checked
@goodUrls    = ();      # list of good URLs
@badUrls     = ();      # list of bad URLs
@isScanned   = ();      # list of scanned URLs
%refs        = ();      # reference lists
# Use the first URL as the model for the local prefix. We remove the
# trailing file name of the URL and retain the prefix. Any URL that
# begins with this prefix will be considered a local URL and
# available for further scanning.
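# For example (illustrative URL): from
#     http://www.example.com/docs/index.html
# the substitution below yields the local prefix
#     http://www.example.com/docs/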
$localprefix = ($opt_p ? $opt_p : $ARGV[0]);
$localprefix =~ s%[^/]*$%%;
print "Local Prefix = $localprefix\n" if $verbose;
if ($doRecurse && !$localprefix) {
    die "A local prefix is required to restrict recursive fetching\n";
}
# Put each command line arg on the list of files to scan. If the
# argument is a file name, convert it to a URL by prepending a
# "file:" to it.
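# For example, an existing local file "docs/index.html" (name is
# illustrative) becomes the URL "file:docs/index.html".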
for $arg (@ARGV) {
    if (-e $arg) {
        $arg = "file:" . $arg;
    }
    push (@tobeScanned, $arg);
}
# Create the global parser and user agent.
$parser = new HTML::LinkExtor (\&HandleParsedLink);
$agent = new LWP::UserAgent;
# Keep Scanning and Checking until there are no more URLs
while (@tobeScanned || @tobeChecked) {
    while (@tobeChecked) {
        my $url = shift @tobeChecked;
        CheckUrl ($url);
    }
    if (@tobeScanned) {
        my $url = shift @tobeScanned;
        ScanUrl ($url);
    }
}
# Print the results.
if ($printUrls) {
    print "Scanned URLs: ", join (" ", sort @isScanned), "\n";
    print "\n";
    print "Good URLs: ", join (" ", sort @goodUrls), "\n";
    print "\n";
    print "Bad URLs: ", join (" ", sort @badUrls), "\n";
}
print "\n";
for $url (sort @badUrls) {
    print "BAD URL $url referenced in ...\n";
    for $ref (sort @{$refs{$url}}) {
        print "... $ref\n";
    }
    print "\n";
}
print int (@isScanned), " URLs Scanned\n";
print int (keys %refs), " URLs checked\n";
print int (@goodUrls), " good URLs found\n";
print int (@badUrls), " bad URLs found\n";
__END__
=head1 NAME

checklinks - Check Hypertext Links on a Web Page

=head1 SYNOPSIS

B<checklinks> [B<-hHpruvV>] I<urls>...

=head1 DESCRIPTION

I<checklinks> will scan a web site for bad HTML links.
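
For example, a typical invocation might look like this (the URL is
illustrative only):

    checklinks -r -v http://www.example.com/index.html

With B<-r>, local pages referenced by the given page are scanned as
well, and any links that cannot be fetched are reported.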

=head1 OPTIONS

=over 6

=item B<-h> (help)

Display a usage message.

=item B<-H> (HELP ... man page)

Display the man page.

=item B<-p> I<prefix> (local prefix)

Specify the local prefix to be used when testing for local URLs. If
this option is not specified when using the B<-r> option, then a
local prefix is calculated from the first URL on the command line.

=item B<-r> (recurse)

Normally, only the URLs listed on the command line are scanned. If
this option is specified, local URLs (as defined by the local prefix)
found within documents are also fetched and scanned.

=item B<-u> (print URL lists)

The complete lists of good, bad, and scanned URLs will be printed in
addition to the normally printed information.

=item B<-v> (verbose mode)

Display "Checking" messages as well as "Scanning" messages.

=item I<urls>

List of URLs to be scanned. If a URL is a filename, then "file:"
is prepended to the filename (this allows local files to be scanned
like other URLs).

=back

=head1 AUTHOR

Jim Weirich <C<jweirich@one.net>>

=head1 LIMITATIONS

When recursively scanning URLs (option B<-r>), a local prefix is
calculated from the first URL on the command line by removing the
last file name in the URL path. If the URL specifies a directory,
the calculated prefix may be incorrect. Always specify the complete
URL or use the B<-p> prefix option to directly specify a local
prefix.

=head1 SEE ALSO

See also the related man pages for HTML::LinkExtor, HTTP::Request,
LWP::UserAgent, LWP::Simple, and URI::URL.

=cut