#!/usr/bin/perl -w #################### # # A number of examples of regular expression and their uses. # See the camel book, chapter 5 for all the details. # ################### use strict; print "Content-type: text/plain\n\n"; print " *********************** * Some regex examples * *********************** "; # Define some text. my $string = " This is some stuff : 123, balloon, 01234, bookkeeper "; print " Start with string '$string' "; # Substitue first instance of 123 for 456 in a copy of that text. (my $new = $string) =~ s/123/456/; print " Apply s/123/456/ to get '$new' "; # Set the $_ variable. Look for digits. # In scalar context, return 1 or 0 for success or failure. print ' Looking for /(\d+)/ '; for ($new) { my $boolean = /(\d+)/; print " in scalar context : $boolean \n"; } # Look for consecutive digits in the string # In scalar context, "1" or "0" is returned for true or false. # In array context, we return a list of the successful matches. # The first one gets put in the first element of the list. # Here are two forms similar forms. (my $firstmatch) = ( $new =~ m/(\d+)/ ); # return first match only my ($first, $second) = $new =~ m/(\d+)/g; # note the "g" for global print " firstmatch in array context : $firstmatch \n"; print " first, second matches in array context : $first $second \n"; #----------------------------------------------- # Let's look for words with two doubled characters, like "garfoopper" . # # Here's dense version : # my @allmatches = $string =~ m|(\w*(.)\2(.)\3\w*)|g ; # # You might think that "bookkeeper" would be found twice # since there are two pair of doubled characters "ookk" and "kkee", # but the regex engine doesn't backtrack by default, so it finds # "ookk" and then keeps going, looking at "eepe" next. # # Using the /x (extended) option, we can spread this out to make # it a bit more readable. print " Now let's look for consecutive doubles, like aabb. "; print ' Dense version of regexp is m|(\w*(.)\2(.)\3\w*)|g '; my @allmatches = ( $string =~ m| ( # start remembering \1 \w* # beginning of word (.)\2 # any char followed by itself (.)\3 # a second doubled character \w* # rest of word ) # end of word \1 |gx # look everywhere; use extended syntax ); print " All the matches from regexp = " , join("|", @allmatches), "\n"; # If we only wanted the words, we could print # just every third entry in @allmatches. while (@allmatches) { print " A word with two double chars : $allmatches[0] \n"; shift @allmatches for 1..3; } # --- what follows is from oct 2000 --- print "\n\n\n"; # Define some more text for us to play with. my $text = <<"END_OF_TEXT"; This is the text that the following operations will be applied to. Here is a url: http://www.marlboro.edu. And here is a word that has *stars* around it, and one with _underlines_. Some of our names are in here: Jim and Brandt. END_OF_TEXT print "The text we're going to look at now is \n"; displayText($text); print "\n Look for names: \n"; foreach my $name ( qw( Jim Mark Sue ) ) { if ( $text =~ /$name/ ) { print " Yes, '$name' is in that text. \n"; } else { print " No, '$name' isn't in that text. \n"; } } print "\n A variation: \n"; foreach my $name ( qw( brandt george ) ) { if ( $text =~ m{$name}i ) { # used { } after "m" , and used i (ignore case) flag. print " Yes, '$name' (ignore case) is in that text. \n"; } else { print " No, '$name' (ignore case) isn't in that text. \n"; } } print "\n Switch the first two words of each line. \n"; # Here we are using {} and [] for the two sections of s/ / /. # Notes: # * \S is a non-whitespace char. \s is a whitespace char. # * + is "1 or more times". The ()'s capture into $1 and $2. # * The flags are "g" for global and "m" for multi-line, so ^ matches each new line. # * Idiomatic form for copying $text and then doing the substitution # * Side effect: whitespace at front of line is removed. (my $new_text = $text) =~ s/^\s+(\S+)\s+(\S+)/$2 $1/gm; displayText($new_text); print "\n Clearer version, preserving exact whitespace. (But note that this word definition does not treat *stars* as a word.) \n"; my $word = '\w+'; # Don't use "\w+" here, since that will make your string 'w+' my $space = '\s+'; my $leading = '\s*'; ($new_text = $text) =~ s{ ^ # start of line ( $leading ) # 1 - optional ( $word ) # 2 - first word ( $space ) # 3 - intervening true space ( $word ) # 4 - second word } {$1$4$3$2}gmx; # Replacement - global, multi-line, extended. displayText($new_text); print "\n Yet another way, using the qr (quote regex) operator : \n"; my $regexp = qr{ ^ ( $leading ) ( \w+ ) ( \s+ ) ( \w+ ) }x; ($new_text = $text) =~ s/$regexp/$1$4$3$2/gm; displayText($new_text); print "\n Find and print all occurances of three letter words: \n"; print( join(" | ", $text =~ m{\b\w{3}\b}g ) , "\n"); print "\n Convert *word* and _word_ to html bold and underline \n"; # This is a common idiom to set $_ to $text so we can do things to it. # even though we aren't really looping. for ($new_text = $text) { s {\*(\w+)\*} {$1}g; s {_(\w+)_} {$1}g; } displayText($new_text); # === subroutines ========================================= # Print given text with dashed line before and after. sub displayText { my ($stuff) = @_; print "- " x 20, " start text - - - \n"; print $stuff; print "- " x 20, " end text - - - - \n"; }