#!/usr/bin/perl -w
#################
#
# findlinks
#
# Jim Mahoney (mahoney@marlboro.edu), Oct 2001
# Exercise in regular expressions for MSIE class to
# find and print the links in a given html file.
#
# The documentation is at the end of this file in POD format.
# To read it, type "perldoc findlinks" at the shell prompt.
# Or, to create an html version of the documentation,
# type "pod2html findlinks > findlinks.html".
# To read more about POD (Plain Old Documentation),
# type "perldoc perlpod".  Or just STFW (Search The $!* Web).
#
################
use 5.006;
use strict;
use warnings;

# Complain to the user if not called properly.
unless (@ARGV == 1) {
    die " Wrong number of arguments. Usage: $0 file.html \n";
}

# Tell the nice user what we're up to.
my $filename = $ARGV[0];
print "Analyzing '$filename' for links...\n";

# Open the file explicitly with a three-argument open instead of relying
# on the magic <> operator.  <> opens each name in @ARGV with two-argument
# semantics, so a "filename" such as "some_command |" would be executed,
# and a missing file would only produce a warning rather than stopping.
open my $fh, '<', $filename
    or die "Cannot open '$filename' for reading: $!\n";

# Get all the text to be processed into a single scalar variable,
# and remove the newlines so we don't have to worry about
# missing links that extend across multiple lines.
# (As Rich pointed out in class, we need to do a bit more work if
# we're going to eliminate the newlines: the lines are joined with a
# space so that words on adjacent lines don't run together.  I'm using
# a substitute here rather than chomp.)
my @lines    = <$fh>;               # Slurp in all lines,
my $htmltext = join(" ", @lines);   # and join 'em up,
$htmltext =~ s/\n//g;               # and remove the new lines.
close $fh;

# Now find the links.
# This version looks for a link pattern -link- as either
#   <a href="-link-">, <a href='-link-'>, or <a href=-link->.
# First we define $lre (link regular expression) to match the
# link itself, and then use that in a larger regexp called
# in a list context in a global search which returns all the matches.
#
# NOTE(review): the body of the larger regexp and the code that reports
# the results were lost from this copy of the file; what follows is a
# reconstruction from the description above -- confirm against the
# original before relying on the exact pattern or output format.
my $lre = qr{[^'"\s>]+};            # link: a run of non-quote, non-space, non-">" chars
my @links = $htmltext =~ m{
        <a \s+              # match the start of an anchor tag,
        href \s* = \s*      # then href= with optional whitespace,
        ['"]?               # then an optional opening quote,
        ($lre)              # and capture the link itself.
    }xig;

# Report what we found, one link per line.
print "Found " . scalar(@links) . " link(s):\n";
print "  $_\n" for @links;

=head1 NAME

findlinks - print the links found in an html file

=head1 SYNOPSIS

 findlinks someFile.html

=head1 DESCRIPTION

Prints a list of the links found in an html file.

This program is a class exercise for the IPL class with the MSIE
program at Marlboro College's Graduate Center.

=head1 AUTHOR

Jim Mahoney (mahoney@marlboro.edu)

=head1 COPYRIGHT

This software may be copied under the terms of the Artistic License.
See L<perlartistic> or the file F<Artistic>.

=head1 SEE ALSO

L<perlre>, L<perlretut>

=cut