#! /usr/bin/perl -w # This Perl script implements Phil Hazel's Regular Expression Exercises, an # interative application to test people's understanding of regular expressions. # Implementation started: 07-Dec-2006 # # Philip Hazel # University of Cambridge Computing Service ################################################## # Script initialization # ################################################## # This script needs the Term::ReadLine, Term::ReadKey, and POSIX modules to be # installed, but otherwise should be freestanding. @required_modules = ( "Term::ReadLine", "Term::ReadKey", "Time::HiRes", "POSIX" ); foreach $req (@required_modules) { eval "use $req;"; if ($@) { print STDERR "** The Perl module \"$req\" is not installed. ". "The phreex script needs\n** the following modules:\n"; foreach $r (@required_modules) { print STDERR " $r\n"; } exit 1; } } # Set up ReadLine functionality $term = new Term::ReadLine 'phreex'; $OUT = $term->OUT || \*STDOUT; # Get the POSIX signal functionality and set the handler use POSIX ':signal_h'; sigaction SIGALRM, new POSIX::SigAction sub { die "Die SIGALRM\n"; } or die "Error setting SIGALRM handler: $!\n"; # Arrange for time() to give high resolution use Time::HiRes 'time'; ################################################## # Read all the data from below the script # ################################################## # Lines that start with a dot are always ignored. sub initialize_data{ $ecount = 0; while () { s/\s+$//; next if /^\./; if (/^>>INTRO>>$/) { die "Internal error: INTRO defined twice\n " if defined $intro; $intro = ""; while () { next if /^\./; last if /^>>/; $intro .= $_; } } if (/^>>PROMPTS>>$/) { die "Internal error: PROMPTS defined twice\n" if defined $prompt[0]; my $pno = 0; while () { s/\s+$//; next if /^\./; last if /^>>/; $prompt[$pno++] = "$_ "; } die "Internal error: missing PROMPT strings\n" if $pno < $prompts_required; } if (/^>>EX (.*)>>$/) { my($n) = $ecount++; $title[$n] = $1; $rubric[$n] = ""; while () { next if /^\./; last if /^>>/; $rubric[$n] .= $_; } s/\s+$//; if (/^>>YES>>$/) { die "Internal error: YES found twice for exercise $n\n" if defined $yes[$n][0]; my $m = 0; while () { s/\s+$//; s/"/\\"/g; # Escape double quotes $_ = eval "\"$_\""; # Process as Perl string next if /^\./; last if /^>>/; if (/^\+(\d+)\s(.*)/s) { $yescap[$n][$m]{$1} = $2; } else { $yes[$n][$m++] = $_; } } } if (/^>>NO>>$/) { die "Internal error: NO found twice for exercise $n\n" if defined $no[$n][0]; my $m = 0; while () { s/\s+$//; s/"/\\"/g; # Escape double quotes $_ = eval "\"$_\""; # Process as Perl string next if /^\./; last if /^>>/; $no[$n][$m++] = $_; } } if (/^>>ANSWERS>>$/) { die "Internal error: ANSWERS found twice for exercise $n\n" if defined $answers[$n][0]; my $m = 0; while () { s/\s+$//; next if /^\./; last if /^>>/; $answers[$n][$m++] = $_; } } if (/^>>TIME (\d+)>>$/) { die "Internal error: TIME found twice for exercise $n\n" if defined $timecount[$n]; $timecount[$n] = $1; while () { s/\s+$//; next if /^\./; die "Internal error: unrecognized line after TIME for exercise $n\n" if !/^>>/; last; } } # End of exercise; if not at EOF restart the outer loop redo if defined $_; } return if (/^>>END>>$/); die "Internal error: invalid DATA line: $_\n"; } } ################################################## # Make a string printable # ################################################## sub printable { my($s) = $_[0]; $s =~ s/\n/\\n/g; $s =~ s/([\x00-\x1f])/sprintf("\\x%0.2x", ord $1)/eg; return $s; } ################################################## # Run a series of timing tests # ################################################## sub time_it { my($re, $mre, $count, @s) = @_; my($i, $j, $starttime, $endtime, $yourtime, $myre, $mytime); for ($i = 0; $i < @s; $i++) { my($ss) = printable($s[$i]); print $OUT "\n \e[1m$ss\e[m\n"; $starttime = time(); for ($j = 0; $j < $count; $j++) { $s[$i] =~ /$re/; } $endtime = time(); $yourtime = ($endtime - $starttime) * 10000; printf $OUT "Your time: %4.0f\n", $yourtime; if (defined $mre) { $starttime = time(); for ($j = 0; $j < $count; $j++) { $s[$i] =~ /$mre/; } $endtime = time(); $mytime = ($endtime - $starttime) * 10000; printf $OUT " My time: %4.0f\n", $mytime; } } } ################################################## # Run an exercise # ################################################## sub exercise { my $en = $_[0]; my ($i, $re, $s, @n, @y, %c); print $OUT "\nExercise $en: $title[$en]\n\n"; print $OUT $rubric[$en]; print $OUT "\n"; MAINLOOP: for (;;) { my($notime) = 0; $_ = $term->readline("Enter regex: "); return if !defined $_; if (/^\?$/) { if (! defined $answers[$en]) { print $OUT "There are no suggested answers to this exercise.\n"; } else { my(@a) = @{ $answers[$en] }; if (@a > 1) { printf $OUT "There are %d suggested answers:\n\n", scalar(@a); for ($i = 1; $i <= @a; $i++) { printf $OUT "$i: $answers[$en][$i-1]\n\n"; } } else { print $OUT "Suggested answer:\n"; print $OUT "$answers[$en][0]\n\n"; } } next; } if (/^\?(\d+)$/) { my($n) = $1; if ($n == 0 || !defined $answers[$en][$n-1]) { print $OUT "Suggested answer $n does not exist.\n"; next; } else { $_ = $answers[$en][$n-1]; print $OUT "\e[1m$_\e[m\n"; $term->addhistory($_); $notime = 1 if $n == 1; } } if (/^\*(.*)/) { my($filename) = $1; if (!open IN, $filename) { print $OUT "** Failed to open $filename: $!\n"; next; } else { undef $/; $_ = ; s/\s+$//; print $OUT "$_\n"; close IN; $/ = "\n"; } } eval { $re = qr{$_} }; if ($@) { $@ =~ s/at \S+ line \d+, line \d+\.$//; print $OUT "** Syntax error in pattern:\n"; print $OUT "$@\n"; next; } @y = @{ $yes[$en] }; @n = (defined $no[$en])? @{ $no[$en] } : (); print $OUT "\nTesting strings that should match:\n"; for ($i = 0; $i < @y; $i++) { $s = printable($y[$i]); print $OUT " \e[1m$s\e[m\n"; alarm(5); eval { $y[$i] =~ /$re/; }; alarm(0); if ($@ ne "") { print $OUT "** Your pattern is taking too long to match this string\n"; print $OUT "** Please try to improve it\n\n"; next MAINLOOP; } # Re-evaluate because captured info may have been messed up if ($y[$i] =~ /$re/) { if (defined $yescap[$en][$i]) { %c = %{ $yescap[$en][$i] }; foreach $k (sort keys %c) { my($value) = (defined $-[$k])? (substr $y[$i], $-[$k], $+[$k] - $-[$k]) : ""; if ($value ne $c{$k}) { my($pn) = ($k == 0)? "matched" : ("captured by group " . (($k =~ /^\d+$/)? "$k" : "'$k'")); print $OUT "** Incorrect value $pn\n"; $s = printable($c{$k}); print $OUT "** Expected: \e[1m$s\e[m\n"; $s = printable($value); print $OUT "** Found: \e[1m$s\e[m\n\n"; next MAINLOOP; } } } } else { print $OUT "** Match failed\n\n"; next MAINLOOP; } } if (@n > 0) { print $OUT "\nTesting strings that should not match:\n"; for ($i = 0; $i < @n; $i++) { $s = printable($n[$i]); print $OUT " \e[1m$s\e[m\n"; alarm(5); eval { $n[$i] =~ /$re/; }; alarm(0); if ($@ ne "") { print $OUT "** Your pattern is taking too long to match this string\n"; print $OUT "** Please try to improve it\n\n"; next MAINLOOP; } if ($n[$i] =~ /$re/) { print $OUT "** Match succeeded when it should not\n\n"; next MAINLOOP; } } } $SIG{'ALRM'} = 'DEFAULT'; print $OUT "\nAll tests succeeded. " . "Your regular expression works as required"; if (defined @timecount && defined $timecount[$en] && !$notime) { my($mre); if (defined $answers[$en][0]) { my($first) = (defined $answers[$en][1])? " first" : ""; print $OUT ", but is it\n" . "efficient? Timing your regex and the$first suggested answer:\n"; eval { $mre = qr"$answers[$en][0]" }; die "** Internal error: syntax error in answer pattern: $@\n" if $@; } else { print $OUT ".\nTiming your regex:\n"; } time_it($re, $mre, $timecount[$en], @y); time_it($re, $mre, $timecount[$en], @n); } else { print $OUT ".\n"; } return; } } ################################################## # Main Program # ################################################## $prompts_required = 2; $epn = 0; initialize_data(); printf $OUT "You are using Perl $]\n"; printf $OUT $intro, $ecount; $eno = 0; for(;;) { print $OUT "\n"; last if !defined ($_ = $term->readline(sprintf($prompt[$epn], $eno))); if (/^\?$/) { for ($k = 0; $k < @title; $k++) { printf $OUT "%2d %s\n", $k, $title[$k]; } next; } if (!/^\s*$/) { if (!/^(\d+)$/) { print $OUT "$_ is not a number\n"; next; } if ($1 > $ecount - 1) { print $OUT "$_ is too big\n"; next; } $eno = $1; } exercise($eno); if ($eno < $ecount - 1) { $eno++; $epn = 0; } else { print $OUT "\nYou have just run the last exercise.\n"; $epn = 1; } } print $OUT "\nGoodbye\n\n"; # The data for the various tests follows below. Lines that are data to be # matched (YES and NO lines) are Perl-expanded, and so may contained backslash # escapes such as \n or \xff. None of the other lines are so expanded; in # particular, the ANSWERS lines are not. __END__ . >>INTRO>> ---- Welcome to Philip Hazel's Regular Expression Exercises ---- There are currently %d exercises for you to try. You can work through them in order, or jump around as you please. All the input lines that you type are saved in a history list and can be recalled by means of the up and down arrow keys. You can edit input lines using the left and right arrow keys. Use ^D (Control+D) to abandon an individual test, and at the outer level to terminate the whole program. Enter ? at the main prompt to obtain a list of exercises. When entering a regex, do NOT enclose it in delimiters. If you need to set any options, use the internal option-setting sequences, for example (?i) to set case-independence. If you enter ? at a regex prompt, you will be shown the built-in suggested answer(s) to the exercise. If you enter ? followed by a number, the suggested answer of that number will be used to run the test, and will be available on the history list for re-use or modification. In many cases there is only one suggested answer, accessible with ?1. Tests that are normally timed skip the timing when ?1 is used. If you enter a line starting with an asterisk, the rest of the line must be a file name. The entire contents of the file, with trailing whitespace removed, are taken as the regex. There are several ways of solving each of these problems. The matching process is timed for some of them to give you a feel for the performance trade-offs. Some of the exercises are surprisingly tricky; the idea is to try to get you used to the Regex Way Of Thinking. :-) . .......................................................................... . >>PROMPTS>> Enter an exercise number (just press RETURN for the next one) [%d]: Enter a previous exercise number or ^D to exit the program [%d]: . .......................................................................... . >>EX Test the exercise mechanism>> This test is relatively simple, and is provided so that you can get used to the way this exerciser works. Write a regex to match the phrase "regular expression" without regard to the case of letters, allowing for the plural. Try making some mistakes to see how the exerciser handles them. . >>YES>> +0 regular expression What is a regular expression? +0 Regular Expressions Regular Expressions are very useful. +0 REGULAR EXPRESSION A CAPITAL REGULAR EXPRESSION . >>NO>> There is no matching text in this line. I don't know if there exists an irregular expression! Is this an exercise in regular expressionism? >>ANSWERS>> (?i)\bregular expressions?\b . .......................................................................... . >>EX Match at the start of a string>> Match a string that starts with one of the words "Their" or "There". Your answer will be timed. . >>YES>> +0 There There we go +0 Their Their time has come >>NO>> Therefore should not match We don't go there today. >>ANSWERS>> ^The(ir|re)\b >>TIME 10000>> . .......................................................................... . >>EX Match at the end of a string>> The test strings are Internet domain names like cam.ac.uk. Write a regex to match if the string ends with .uk or .gb. . >>YES>> cam.ac.uk iso.name.is.gb >>NO>> example.uk.fi this.is.a.very.long.name.that.does.not.end.with.one.of.the.requested.values >>ANSWERS>> (?>.*)(?<=\.(?:uk|gb)) >>TIME 10000>> . .......................................................................... . >>EX Match at the start of a line>> The test strings contain embedded newlines, indicated by "\n" in the output. Match one of the words "Their" or "There" at the start of a line. . >>YES>> +0 There There we go +0 Their Several lines.\nTheir boundaries\nare newline characters. >>NO>> One line. There should be no match. >>ANSWERS>> (?m)^The(ir|re)\b (^|(?<=\n))The(ir|re)\b >>TIME 10000>> . .......................................................................... . >>EX Matching binary data>> The test strings contain binary data; non-printing bytes are shown in hex as as "\xdd" in the output. Match all the data from the first binary zero character to the end of the string. . >>YES>> +0 \00\03\04 \01\02\00\03\04 +0 \00\03\04\012\05\06 \01\02\00\03\04\012\05\06 +0 \00\01\012\00\03 \00\01\012\00\03 >>ANSWERS>> (?s)\x00(.*) . .......................................................................... . >>EX Dismantling a file name>> The test strings are Unix file names like /usr/bin/perl. Write a regex to match such that the final component of the name ("perl" in the example above) ends up in the first captured substring ($1). . >>YES>> +1 perl /usr/bin/perl +1 name /this/is/a/very/very/very/long/file/name >>ANSWERS>> ^.*/([^/]*$) >>TIME 10000>> . .......................................................................... . >>EX Words containing three vowels in succession>> You will need to run this exercise twice. Try "[aeiou]" and "(a|e|i|o|u)" in two different patterns to find a word (in upper or lower case) that contains three vowels in succession. Your answers will be timed so you can see the speed difference between a character class and using single-character alternatives. . >>YES>> +0 various There are various words that match. +0 IOUs IOUs are promissory notes. +0 Beau Beau Geste +0 beautiful Oh! What a beautiful morning! +0 Eau Eau de Cologne >>NO>> There are plenty of words that do not match. >>ANSWERS>> (?i)\b[a-z]*[aeiou]{3}[a-z]*\b (?i)\b[a-z]*(?:a|e|i|o|u){3}[a-z]*\b >>TIME 10000>> . .......................................................................... . >>EX Words not containing any vowels>> Match a word that does not contain any vowels, where words contain letters (in either case) and hyphens, and are separated by spaces with normal punctuation (this is trickier than it sounds). . >>YES>> +0 rhythm This is the rhythm of life. +0 Rhythm Rhythm is what it is all about. +0 rhythms Music has rhythms! +0 by-by Bye-bye is not spelt by-by. >>NO>> Every word here contains a vowel. A yo-yo is a toy. The time is 23h00. >>ANSWERS>> (?i)(^|(?<=\s))[b-df-hj-np-tv-z-]+($|(?=[\s.,:;!?])) . .......................................................................... . >>EX Matched a quoted substring>> Write a regex to match a part of the subject string that is in either single or double quotes. You must allow for backslashed quotes within the quoted string. . >>YES>> +0 "Easy peasy" "Easy peasy" +0 'I don\\'t want this' He said 'I don\\'t want this'. . >>NO>> Mustn't match this one but we make it rather long to check for big backtracks. Don't match misquotes" at all. "A never-ending quote... >>ANSWERS>> (["'])((?>[^\\"']+)|\\.)*\1 . .......................................................................... . >>EX Match a missing newline>> Write a regex to match when the subject string does not end with a newline character. . >>YES>> The quick brown fox. Newlines in the middle \n are not relevant. >>NO>> End with newline\n . >>ANSWERS>> ^(?s)(?>.*)(?>TIME 10000>> . .......................................................................... . >>EX Match an HTML tag>> The test strings are normal text containing an HTML tag; the exercise is to match the tag. Parameters may have quoted values, using either single or double quotes, but one kind of quoted value may not contain the other. . >>YES>> Some text and a tag. What about an tag? Some tags have parameters . Remember to handle quotes Single quotes Quotes containing angles Quotes containing single quotes >>NO>> There is no tag here: 45<50 (though it's bad HTML). >>ANSWERS>> (?x) < ( "(?>[^"]*)" | '(?>[^']*)' | (?>[^'">]+) )* > >>TIME 10000>> . .......................................................................... . >>EX A tricky capturing example>> Write a regex to match weekday names that appear in text in either full or abbreviated form (e.g. Monday or Fri). The first letter is always capitalized and the rest are in lower case. The three-letter abbreviation must end up in the first captured substring in all cases. For example, if "Wednesday" is found, the first captured substring should contain "Wed", but if "Fri" is found it should contain "Fri". This is a tricky problem. . >>YES>> +1 Mon 'Twas on a Monday morning... +1 Tue Pancakes on Shrove Tuesday! +1 Wed Wednesday's child is full of woe. +1 Thu Thursday next. +1 Fri CU on Fri OK? +1 Sat No, Sat! +1 Sun Sun could be something else... >>NO>> The cat sat on the mat. Writing thursday is incorrect. Wedding is not today. They wed at last. >>ANSWERS>> \b((?:Mon|Fri|Sun)(?=day\b|\b)|Tue(?=sday\b|\b)|Wed(?=nesday\b|\b)|Thu(?=rsday\b|\b)|Sat(?=urday\b|\b))(?>[^y]+y)?\b >>END>>