#!/usr/bin/perl -w
####################
#
# A number of examples of regular expression and their uses.
# See the camel book, chapter 5 for all the details.
#
###################
use strict;
print "Content-type: text/plain\n\n";
print "
***********************
* Some regex examples *
***********************
";
# Define some text.
my $string = " This is some stuff : 123, balloon, 01234, bookkeeper ";
print "
Start with string
'$string'
";
# Substitue first instance of 123 for 456 in a copy of that text.
(my $new = $string) =~ s/123/456/;
print "
Apply s/123/456/ to get
'$new'
";
# Set the $_ variable. Look for digits.
# In scalar context, return 1 or 0 for success or failure.
print '
Looking for /(\d+)/
';
for ($new) {
my $boolean = /(\d+)/;
print " in scalar context : $boolean \n";
}
# Look for consecutive digits in the string
# In scalar context, "1" or "0" is returned for true or false.
# In array context, we return a list of the successful matches.
# The first one gets put in the first element of the list.
# Here are two forms similar forms.
(my $firstmatch) = ( $new =~ m/(\d+)/ ); # return first match only
my ($first, $second) = $new =~ m/(\d+)/g; # note the "g" for global
print " firstmatch in array context : $firstmatch \n";
print " first, second matches in array context : $first $second \n";
#-----------------------------------------------
# Let's look for words with two doubled characters, like "garfoopper" .
#
# Here's dense version :
# my @allmatches = $string =~ m|(\w*(.)\2(.)\3\w*)|g ;
#
# You might think that "bookkeeper" would be found twice
# since there are two pair of doubled characters "ookk" and "kkee",
# but the regex engine doesn't backtrack by default, so it finds
# "ookk" and then keeps going, looking at "eepe" next.
#
# Using the /x (extended) option, we can spread this out to make
# it a bit more readable.
print "
Now let's look for consecutive doubles, like aabb.
";
print '
Dense version of regexp is m|(\w*(.)\2(.)\3\w*)|g
';
my @allmatches = (
$string =~ m|
( # start remembering \1
\w* # beginning of word
(.)\2 # any char followed by itself
(.)\3 # a second doubled character
\w* # rest of word
) # end of word \1
|gx # look everywhere; use extended syntax
);
print " All the matches from regexp = " , join("|", @allmatches), "\n";
# If we only wanted the words, we could print
# just every third entry in @allmatches.
while (@allmatches) {
print " A word with two double chars : $allmatches[0] \n";
shift @allmatches for 1..3;
}
# --- what follows is from oct 2000 ---
print "\n\n\n";
# Define some more text for us to play with.
my $text = <<"END_OF_TEXT";
This is the text that the following operations will be applied to.
Here is a url: http://www.marlboro.edu. And here is a word that
has *stars* around it, and one with _underlines_.
Some of our names are in here: Jim and Brandt.
END_OF_TEXT
print "The text we're going to look at now is \n";
displayText($text);
print "\n Look for names: \n";
foreach my $name ( qw( Jim Mark Sue ) ) {
if ( $text =~ /$name/ ) {
print " Yes, '$name' is in that text. \n"; }
else {
print " No, '$name' isn't in that text. \n"; }
}
print "\n A variation: \n";
foreach my $name ( qw( brandt george ) ) {
if ( $text =~ m{$name}i ) { # used { } after "m" , and used i (ignore case) flag.
print " Yes, '$name' (ignore case) is in that text. \n"; }
else {
print " No, '$name' (ignore case) isn't in that text. \n"; }
}
print "\n Switch the first two words of each line. \n";
# Here we are using {} and [] for the two sections of s/ / /.
# Notes:
# * \S is a non-whitespace char. \s is a whitespace char.
# * + is "1 or more times". The ()'s capture into $1 and $2.
# * The flags are "g" for global and "m" for multi-line, so ^ matches each new line.
# * Idiomatic form for copying $text and then doing the substitution
# * Side effect: whitespace at front of line is removed.
(my $new_text = $text) =~ s/^\s+(\S+)\s+(\S+)/$2 $1/gm;
displayText($new_text);
print "\n Clearer version, preserving exact whitespace.
(But note that this word definition does not treat *stars* as a word.) \n";
my $word = '\w+'; # Don't use "\w+" here, since that will make your string 'w+'
my $space = '\s+';
my $leading = '\s*';
($new_text = $text)
=~ s{ ^ # start of line
( $leading ) # 1 - optional
( $word ) # 2 - first word
( $space ) # 3 - intervening true space
( $word ) # 4 - second word
}
{$1$4$3$2}gmx; # Replacement - global, multi-line, extended.
displayText($new_text);
print "\n Yet another way, using the qr (quote regex) operator : \n";
my $regexp = qr{ ^
( $leading )
( \w+ )
( \s+ )
( \w+ )
}x;
($new_text = $text) =~ s/$regexp/$1$4$3$2/gm;
displayText($new_text);
print "\n Find and print all occurances of three letter words: \n";
print( join(" | ", $text =~ m{\b\w{3}\b}g ) , "\n");
print "\n Convert *word* and _word_ to html bold and underline \n";
# This is a common idiom to set $_ to $text so we can do things to it.
# even though we aren't really looping.
for ($new_text = $text) {
s {\*(\w+)\*} {$1}g;
s {_(\w+)_} {$1}g;
}
displayText($new_text);
# === subroutines =========================================
# Print given text with dashed line before and after.
sub displayText {
my ($stuff) = @_;
print "- " x 20, " start text - - - \n";
print $stuff;
print "- " x 20, " end text - - - - \n";
}