#!/usr/bin/perl -w
#################
#
# findlinks
#
# Jim Mahoney (mahoney@marlboro.edu), Oct 2001
# Exercise in regular expressions for MSIE class to
# find and print the links in a given html file.
#
# The documentation is at the end of this file in POD format.
# To read it, type "perldoc findlinks" at the shell prompt.
# Or, to create an html version of the documentation,
# type "pod2html findlinks > findlinks.html".
# To read more about POD (Plain Old Documentation),
# type "perldoc perlpod". Or just STFW (Search The $!* Web).
#
################
use 5.006;
use strict;
use warnings;
# Complain to the user if not called properly.
unless (@ARGV == 1) {
    die " Wrong number of arguments. Usage: $0 file.html \n";
}

# Tell the nice user what we're up to.
my $filename = $ARGV[0];
print "Analyzing '$filename' for links...\n";

# Get all the text to be processed into a single scalar variable,
# and remove any line endings (newlines and carriage returns) so we
# don't have to worry about missing links that extend across
# multiple lines.
# (As Rich pointed out in class, we need to do a bit more work if
# we're going to eliminate the newlines.  I'm using a substitute
# here rather than chomp.)
#
# The file is opened explicitly with three-argument open instead of
# being read through the magic <> operator.  <> only warns and carries
# on with no input when the file can't be opened, and it treats an
# argument such as "some command |" as a pipe to run.  Here an
# unreadable file is a fatal error and the argument is only ever used
# as a file name.  "-" still means standard input, as it did with <>.
my $in;
if ($filename eq '-') {
    $in = \*STDIN;
}
else {
    open($in, '<', $filename)
        or die " Can't open '$filename' for reading: $! \n";
}
my @lines = <$in>;                      # Slurp in all lines,
close($in) unless $filename eq '-';     # (leave STDIN open for others)
my $htmltext = join(" ", @lines);       # and join 'em up,
$htmltext =~ s/[\r\n]//g;               # and remove the line endings.
# Now find the links.
# This version looks for a link pattern -link- in one of several
# alternative forms.
# NOTE(review): the examples of those forms that originally followed
# here are missing from this copy of the file.  Judging by the
# character class below, a link is presumably delimited by a single
# quote, a double quote, whitespace or a closing '>' -- confirm
# against the original.
# First we define $lre (link regular expression) to match the
# link itself, and then use that in a larger regexp, called
# in a list context in a global search, which returns all the matches.
# $lre matches one or more characters that are not a quote (' or "),
# whitespace, or '>'.  The odd-looking "#']};" at the start of the
# trailing comment is presumably there only to re-balance quotes and
# brackets for syntax highlighters; being a comment, it has no effect
# on the program.
my $lre = qr{[^'"\s>]+}; #']};# link regular expression
my @links =
$htmltext =~ m{ # match
findlinks someFile.html
=head1 DESCRIPTION
Prints a list of the links found in an html file.
This program is a class exercise for the IPL class
with the MSIE program at Marlboro College's Graduate Center.
=head1 AUTHOR
Jim Mahoney (mahoney@marlboro.edu)
=head1 COPYRIGHT
This software may be copied under the terms
of the Artistic License. See
L<perlartistic>
or the file F<Artistic> distributed with Perl.
=head1 SEE ALSO
L, L
=cut