Listing 1
#!/usr/bin/perl -w
# checklinks -- Check Hypertext Links on a Web Page
# Usage: See POD below
#------------------------------------
# Copyright (C) 1996 Jim Weirich. All rights reserved.
# Permission is granted for free use or modification.
#------------------------------------
use HTML::LinkExtor;
use HTTP::Request;
use LWP::UserAgent;
use LWP::Simple;
use URI::URL;
use Getopt::Std;
$version = '1.0';
# Usage
#-------------------------------------
# Display the usage message by scanning the POD documentation for
# the usage statement.
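# (The first matching line is the POD SYNOPSIS line after __END__;
#  it is printed with the B<> and I<> markup stripped.)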
sub Usage {
    while (<DATA>) {
        if (/^B<[-A-Za-z0-9_.]+>/) {
            s/[BI]<([^>]*)>/$1/g;
            print "Usage: $_";
            last;
        }
    }
    exit 0;
}
# ManPage
#------------------------------------
# Display the man page by invoking the pod2man (or pod2text) script
# on self.
sub ManPage {
    my($pager) = 'more';
    $pager = $ENV{'PAGER'} if $ENV{'PAGER'};
    if ($ENV{'TERM'} =~ /^(dumb|emacs)$/) {
        system ("pod2text $0");
    } else {
        system ("pod2man $0 | nroff -man | $pager");
    }
    exit 0;
}
# HandleParsedLink
#---------------------------------
# HandleParsedLink is a callback provided for handling HTML links
# found during parsing. $tag is the HTML tag where the link was
# found. %links is a hash that contains the keyword/value pairs from
# the link that contain URLs. For example, if an HTML anchor was
# found, $tag would be "a" and %links would be (href=>"url").
# We check each URL in %links. We make sure the URL is absolute
# rather than relative. URLs that don't begin with "http:" or "file:"
# are ignored. Bookmarks following a "#" character are removed.
# If we have not seen this URL yet, we add it to the list of URLs to
# be checked. Finally, we note where the URL was found in its list
# of references.
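# For illustration (hypothetical tag and file name): given the tag
#     <img src="pic.gif">
# HTML::LinkExtor would invoke this callback roughly as
#     HandleParsedLink("img", src => "pic.gif");
# and the relative "pic.gif" would be made absolute against
# $currentUrl before being queued for checking.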
sub HandleParsedLink {
    my ($tag, %links) = @_;
    for $url (values %links) {
        my $urlobj = new URI::URL $url, $currentUrl;
        $url = $urlobj->abs;
        next if $url !~ /^(http|file):/;
        $url =~ s/#.*$//;
        if (!$refs{$url}) {
            $refs{$url} = [];
            push (@tobeChecked, $url);
        }
        push (@{$refs{$url}}, $currentUrl);
    }
    1;
}
# HandleDocChunk
#--------------------------------
# HandleDocChunk is called by the UserAgent as the web document is
# fetched. As each chunk of the document is retrieved, it is passed
# to the HTML parser object for further processing (which in this
# case means extracting the links).
sub HandleDocChunk {
    my ($data, $response, $protocol) = @_;
    $parser->parse ($data);
}
# ScanUrl
# ------------------------------
# We have a URL that needs to be scanned for further references to
# other URLs. We create a request to fetch the document and give
# that request to the UserAgent responsible for doing the fetch.
sub ScanUrl {
    my($url) = @_;
    $currentUrl = $url;
    push (@isScanned, $url);
    print "Scanning $url\n";
    $request = new HTTP::Request (GET => $url);
    $response = $agent->request ($request, \&HandleDocChunk);
    if ($response->is_error) {
        die "Can't Fetch URL $url\n";
    }
    $parser->eof;
}
# CheckUrl
# ------------------------------
# We have a URL that needs to be checked and validated. We attempt
# to get the header of the document using the head() function. If
# this fails, we add the URL to our list of bad URLs. If we do get
# the header, the URL is added to our good URL list. If the good URL
# is part of our local web site (i.e. it begins with the local
# prefix), then we want to scan this URL for more references.
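# For reference (standard LWP::Simple behavior): in scalar context
# head($url) returns true on success, while in list context it
# returns the five values
#     ($content_type, $document_length, $modified_time, $expires, $server).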
sub CheckUrl {
    my($url) = @_;
    print " Checking $url\n" if $verbose;
    if (!head ($url)) {
        push (@badUrls, $url);
    } else {
        push (@goodUrls, $url);
        if ($doRecurse && $url =~ /\.html?/
            && $url =~ /^$localprefix/) {
            push (@tobeScanned, $url);
        }
    }
}
# Main Program
#---------------------------------
use vars qw($opt_h $opt_H $opt_p $opt_r $opt_u $opt_v $opt_V);
getopts('hHp:ruvV') || die "Command aborted.\n";
$verbose = ($opt_v ? $opt_v : 0);
$printUrls = ($opt_u ? $opt_u : 0);
$doRecurse = ($opt_r ? $opt_r : 0);
die "Version $version\n" if $opt_V;
ManPage() if $opt_H;
Usage() if $opt_h || @ARGV==0;
# Initialize our bookkeeping arrays
@tobeScanned = ();      # list of URLs to be scanned
@tobeChecked = ();      # list of URLs to be checked
@goodUrls    = ();      # list of good URLs
@badUrls     = ();      # list of bad URLs
@isScanned   = ();      # list of scanned URLs
%refs        = ();      # reference lists
# Use the first URL as the model for the local prefix. We remove the
# trailing file name of the URL and retain the prefix. Any URL that
# begins with this prefix will be considered a local URL and
# available for further scanning.
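# For example (illustrative URL): from
#     http://www.example.com/docs/index.html
# the substitution below yields the local prefix
#     http://www.example.com/docs/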
$localprefix = ($opt_p ? $opt_p : $ARGV[0]);
$localprefix =~ s%[^/]*$%%;
print "Local Prefix = $localprefix\n" if $verbose;
if ($doRecurse && !$localprefix) {
    die "A local prefix is required to restrict recursive fetching\n";
}
# Put each command line arg on the list of files to scan. If the
# argument is a file name, convert it to a URL by prepending a
# "file:" to it.
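# For example, an existing local file "docs/index.html" (name is
# illustrative) becomes the URL "file:docs/index.html".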
for $arg (@ARGV) {
    if (-e $arg) {
        $arg = "file:" . $arg;
    }
    push (@tobeScanned, $arg);
}
# Create the global parser and user agent.
$parser = new HTML::LinkExtor (\&HandleParsedLink);
$agent = new LWP::UserAgent;
# Keep Scanning and Checking until there are no more URLs
while (@tobeScanned || @tobeChecked) {
    while (@tobeChecked) {
        my $url = shift @tobeChecked;
        CheckUrl ($url);
    }
    if (@tobeScanned) {
        my $url = shift @tobeScanned;
        ScanUrl ($url);
    }
}
# Print the results.
if ($printUrls) {
    print "Scanned URLs: ", join (" ", sort @isScanned), "\n";
    print "\n";
    print "Good URLs: ", join (" ", sort @goodUrls), "\n";
    print "\n";
    print "Bad URLs: ", join (" ", sort @badUrls), "\n";
}
print "\n";
for $url (sort @badUrls) {
    print "BAD URL $url referenced in ...\n";
    for $ref (sort @{$refs{$url}}) {
        print "... $ref\n";
    }
    print "\n";
}
print int (@isScanned), " URLs Scanned\n";
print int (keys %refs), " URLs checked\n";
print int (@goodUrls), " good URLs found\n";
print int (@badUrls), " bad URLs found\n";
__END__
=head1 NAME

checklinks - Check Hypertext Links on a Web Page

=head1 SYNOPSIS

B<checklinks> [B<-hHpruvV>] I<urls>...

=head1 DESCRIPTION

I<checklinks> will scan a web site for bad HTML links.
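
For example, a typical invocation might look like this (the URL is
illustrative only):

    checklinks -r -v http://www.example.com/index.html

With B<-r>, local pages referenced by the given page are scanned as
well, and any links that cannot be fetched are reported.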

=head1 OPTIONS

=over 6

=item B<-h> (help)

Display a usage message.

=item B<-H> (HELP ... man page)

Display the man page.

=item B<-p> I<prefix> (local prefix)

Specify the local prefix to be used when testing for local URLs. If
this option is not specified when using the B<-r> option, then a
local prefix is calculated from the first URL on the command line.

=item B<-r> (recurse)

Normally, only the URLs listed on the command line are scanned. If
this option is specified, local URLs (as defined by the local prefix)
found within documents are also fetched and scanned.

=item B<-u> (print URL lists)

The complete lists of good, bad, and scanned URLs will be printed in
addition to the normally printed information.

=item B<-v> (verbose mode)

Display "Checking" messages as well as "Scanning" messages.

=item I<urls>

List of URLs to be scanned. If a URL is a filename, then "file:"
is prepended to the filename (this allows local files to be scanned
like other URLs).

=back

=head1 AUTHOR

Jim Weirich <C<jweirich@one.net>>

=head1 LIMITATIONS

When recursively scanning URLs (option B<-r>), a local prefix is
calculated from the first URL on the command line by removing the
last file name in the URL path. If the URL specifies a directory,
the calculated prefix may be incorrect. Always specify the complete
URL or use the B<-p> prefix option to directly specify a local
prefix.

=head1 SEE ALSO

See also the related man pages for HTML::LinkExtor, HTTP::Request,
LWP::UserAgent, LWP::Simple, and URI::URL.

=cut