Code Listings
Chapter 24: Regular Expressions
The Regular Expression samples are also preloaded into the LINQPad utility.
Regular expression basics:
// The '?' quantifier makes the preceding 'u' optional (zero or one occurrence),
// so both spellings match but a doubled 'u' does not.
bool american = Regex.Match ("color",   @"colou?r").Success;
bool british  = Regex.Match ("colour",  @"colou?r").Success;
bool doubled  = Regex.Match ("colouur", @"colou?r").Success;
Console.WriteLine (american);          // True
Console.WriteLine (british);           // True
Console.WriteLine (doubled);           // False

// A Match object reports where the hit was found and which text it covers.
Match hit = Regex.Match ("any colour you like", @"colou?r");
Console.WriteLine (hit.Success);       // True
Console.WriteLine (hit.Index);         // 4
Console.WriteLine (hit.Length);        // 6
Console.WriteLine (hit.Value);         // colour
Console.WriteLine (hit.ToString());    // colour
// Match returns the first hit; NextMatch returns the following one.
const string sentence = "One color? There are two colours in my head!";
Match m1 = Regex.Match (sentence, @"colou?rs?");
Match m2 = m1.NextMatch();

Console.WriteLine (m1);    // color
Console.WriteLine (m2);    // colours
// Regex.Matches returns every hit at once; enumerate the collection to print each.
MatchCollection allHits =
    Regex.Matches ("One color? There are two colours in my head!", @"colou?rs?");

foreach (Match hit in allHits)
{
    Console.WriteLine (hit);    // color, then colours
}
// IsMatch is a shortcut for Match(...).Success. The alternation accepts "Jen",
// "Jenny" or "Jennifer" because the whole group is optional.
bool found = Regex.IsMatch ("Jenny", "Jen(ny|nifer)?");
Console.WriteLine (found);    // True
Compiled regular expressions:
// Build the Regex object once and reuse it for several inputs.
var sausagePattern = new Regex (@"sausages?");

Console.WriteLine (sausagePattern.Match ("sausage"));     // sausage
Console.WriteLine (sausagePattern.Match ("sausages"));    // sausages
Regex options:
// Case-insensitivity supplied through the RegexOptions argument...
Match viaOption = Regex.Match ("a", "A", RegexOptions.IgnoreCase);
Console.WriteLine (viaOption);     // a

// ...or switched on inside the pattern itself with (?i).
Match viaInline = Regex.Match ("a", @"(?i)A");
Console.WriteLine (viaInline);     // a

// (?-i) switches the option off again part-way through the pattern:
// the first 'a' is case-insensitive, the second must be lower-case.
Match toggled = Regex.Match ("AAAa", @"(?i)a(?-i)a");
Console.WriteLine (toggled);       // Aa
Character escapes:
// A metacharacter must be escaped with a backslash to be matched literally.
Match escaped   = Regex.Match ("what?", @"what\?");
Match unescaped = Regex.Match ("what?", @"what?");
Console.WriteLine (escaped);      // what? (correct)
Console.WriteLine (unescaped);    // what (incorrect)

// Escape / Unescape convert between literal text and its pattern form.
Console.WriteLine (Regex.Escape (@"?"));        // \?
Console.WriteLine (Regex.Unescape (@"\?"));     // ?

// Without a verbatim (@) string, each backslash must be doubled for C# as well.
Console.WriteLine (Regex.Match ("\\", "\\\\"));    // \

// A space is an ordinary character and matches itself.
Console.WriteLine (Regex.IsMatch ("hello world", @"hello world"));    // True
Character sets:
// [Tt] matches either letter.
int thatCount = Regex.Matches ("That is that.", "[Tt]hat").Count;
Console.Write (thatCount);         // 2

// [^aeiou] matches any character that is NOT a lower-case vowel.
int qIndex = Regex.Match ("quiz qwerty", "q[^aeiou]").Index;
Console.Write (qIndex);            // 5

// [a-h] is a range; \d is any decimal digit.
bool chessMove = Regex.Match ("b1-c4", @"[a-h]\d-[a-h]\d").Success;
Console.Write (chessMove);         // True

// \p{P} matches any character in the Unicode punctuation category.
bool hasPunctuation = Regex.IsMatch ("Yes, please", @"\p{P}");
Console.Write (hasPunctuation);    // True
Quantifiers:
// '*' = zero or more of the preceding item.
bool numberedCv = Regex.Match ("cv15.doc", @"cv\d*\.doc").Success;
Console.Write (numberedCv);    // True

// '.*' = zero or more of any character.
bool anyCv = Regex.Match ("cvjoint.doc", @"cv.*\.doc").Success;
Console.Write (anyCv);         // True

// '+' = one or more of the preceding item.
int slowCount = Regex.Matches ("slow! yeah slooow!", "slo+w").Count;
Console.Write (slowCount);     // 2
// {2,3} = between two and three repetitions of the preceding item.
var bloodPressure = new Regex (@"\d{2,3}/\d{2,3}");

Match before = bloodPressure.Match ("It used to be 160/110");
Match after  = bloodPressure.Match ("Now it's only 115/75");
Console.WriteLine (before);    // 160/110
Console.WriteLine (after);     // 115/75
Greedy vs. lazy quantifiers:
// Greedy '.*' consumes as much as possible, so a single match runs from the
// first <i> to the LAST </i>.
string html = "<i>By default</i> quantifiers are <i>greedy</i> creatures";

MatchCollection greedyHits = Regex.Matches (html, @"<i>.*</i>");
foreach (Match hit in greedyHits)
{
    Console.WriteLine (hit);    // <i>By default</i> quantifiers are <i>greedy</i>
}
// Lazy '.*?' consumes as little as possible, so each <i>...</i> pair is matched
// separately. `html` is the string declared by the preceding greedy sample.
MatchCollection lazyHits = Regex.Matches (html, @"<i>.*?</i>");
foreach (Match hit in lazyHits)
{
    Console.WriteLine (hit);    // <i>By default</i>, then <i>greedy</i>
}
Lookahead and Lookbehind:
// (?=miles) asserts that "miles" follows without consuming it, so it is not
// part of the match...
Match numberOnly = Regex.Match ("say 25 miles more", @"\d+\s(?=miles)");
Console.WriteLine (numberOnly);     // "25 " (digits plus the trailing space)

// ...which leaves it available for the rest of the pattern to consume.
Match withRest = Regex.Match ("say 25 miles more", @"\d+\s(?=miles).*");
Console.WriteLine (withRest);       // 25 miles more
// The lookahead demands a digit somewhere ahead; .{6,} then demands at least
// six characters. Both conditions are checked from the same starting position.
string password = "...";
const string atLeastSixWithDigit = @"(?=.*\d).{6,}";
bool ok = Regex.IsMatch (password, atLeastSixWithDigit);
// (?!...) is a negative lookahead: "good" only matches when it is NOT followed
// later in the text by "however" or "but".
string regex = "(?i)good(?!.*(however|but))";

bool qualifiedPraise = Regex.IsMatch ("Good work! But...", regex);
bool plainPraise     = Regex.IsMatch ("Good work! Thanks!", regex);
Console.WriteLine (qualifiedPraise);    // False
Console.WriteLine (plainPraise);        // True
// (?<!...) is a negative lookbehind: "good" only matches when it is NOT
// preceded earlier in the text by "however".
string regex = "(?i)(?<!however.*)good";

bool afterHowever = Regex.IsMatch ("However good, we...", regex);
bool standalone   = Regex.IsMatch ("Very good, thanks!" , regex);
Console.WriteLine (afterHowever);    // False
Console.WriteLine (standalone);      // True
Anchors:
// '^' anchors the pattern to the start of the input.
Match atStart = Regex.Match ("Not now", "^[Nn]o");
Console.WriteLine (atStart);    // No

// '$' anchors the pattern to the end of the input.
Match atEnd = Regex.Match ("f = 0.2F", "[Ff]$");
Console.WriteLine (atEnd);      // F
// With RegexOptions.Multiline, '$' matches just before each "\n", not only at
// the end of the input. The (?=\r?$) lookahead tolerates the "\r" of a Windows
// line ending without including it in the match.
string fileNames = "a.txt" + "\r\n" + "b.doc" + "\r\n" + "c.txt";
string r = @".+\.txt(?=\r?$)";

MatchCollection textFiles = Regex.Matches (fileNames, r, RegexOptions.Multiline);
foreach (Match file in textFiles)
{
    Console.Write (file + " ");    // a.txt c.txt
}
// `s` is the input text; it is not declared in this snippet.

// Lines that contain nothing at all (apart from an optional "\r").
MatchCollection emptyLines =
    Regex.Matches (s, "^(?=\r?$)", RegexOptions.Multiline);

// Lines that contain nothing but spaces and tabs.
MatchCollection blankLines =
    Regex.Matches (s, "^[ \t]*(?=\r?$)", RegexOptions.Multiline);
Word boundaries:
// \b asserts a position between a word character and a non-word character
// (or the start/end of the input), so \b\w+\b picks out whole words.
const string title = "Wedding in Sarajevo";
foreach (Match word in Regex.Matches (title, @"\b\w+\b"))
{
    Console.WriteLine (word);    // Wedding, in, Sarajevo
}

// With boundaries only the standalone word counts; without them the "in"
// inside "Wedding" counts too.
int one = Regex.Matches (title, @"\bin\b").Count;    // 1
int two = Regex.Matches (title, @"in").Count;        // 2

// Find the word (plus its trailing space) that immediately precedes "(sic)".
string text = "Don't loose (sic) your cool";
Match flagged = Regex.Match (text, @"\b\w+\b\s(?=\(sic\))");
Console.Write (flagged);    // loose
Groups:
// Parentheses capture sub-matches. Groups[0] is the whole match; numbered
// groups follow in the order of their opening parenthesis.
Match phone = Regex.Match ("206-465-1918", @"(\d{3})-(\d{3}-\d{4})");

Group areaCode    = phone.Groups[1];
Group localNumber = phone.Groups[2];
Group everything  = phone.Groups[0];

Console.WriteLine (areaCode);       // 206
Console.WriteLine (localNumber);    // 465-1918
Console.WriteLine (everything);     // 206-465-1918
Console.WriteLine (phone);          // 206-465-1918
// \1 refers back to whatever group 1 captured, so this finds words whose
// first and last letters are the same.
MatchCollection bookended = Regex.Matches ("pop pope peep", @"\b(\w)\w+\1\b");
foreach (Match word in bookended)
{
    Console.Write (word + " ");    // pop peep
}
Named groups:
// Same idea as a numbered back-reference, but the group is given a name and
// referred to with \k'name'.
string regEx =
    @"\b" +                // word boundary
    @"(?'letter'\w)" +     // match first letter, and name it 'letter'
    @"\w+" +               // match middle letters
    @"\k'letter'" +        // match last letter, denoted by 'letter'
    @"\b";                 // word boundary

MatchCollection bookended = Regex.Matches ("bob pope peep", regEx);
foreach (Match word in bookended)
{
    Console.Write (word + " ");    // bob peep
}
// Capture an element's tag name and its text content into named groups,
// using \k'tag' to require that the closing tag matches the opening one.
string regFind =
    @"<(?'tag'\w+?).*>" +    // match first tag, and name it 'tag'
    @"(?'text'.*?)" +        // match text content, name it 'text'
    @"</\k'tag'>";           // match last tag, denoted by 'tag'

Match element = Regex.Match ("<h1>hello</h1>", regFind);
Group tagName = element.Groups ["tag"];
Group content = element.Groups ["text"];
Console.WriteLine (tagName);    // h1
Console.WriteLine (content);    // hello
Replacing and splitting text:
// The word boundaries stop the "cat" inside "catapult" from being replaced.
string find    = @"\bcat\b";
string replace = "dog";

string rewritten = Regex.Replace ("catapult the cat", find, replace);
Console.WriteLine (rewritten);    // catapult the dog
// In a replacement string, $0 stands for the entire matched text.
string text = "10 plus 20 makes 30";
string bracketed = Regex.Replace (text, @"\d+", @"<$0>");
Console.WriteLine (bracketed);    // <10> plus <20> makes <30>
// Named groups can be referenced in the replacement string as ${name}.
// Here an element's text content is moved into a 'value' attribute.
string regFind =
    @"<(?'tag'\w+?).*>" +    // match first tag, and name it 'tag'
    @"(?'text'.*?)" +        // match text content, name it 'text'
    @"</\k'tag'>";           // match last tag, denoted by 'tag'

string regReplace =
    @"<${tag}" +             // <tag
    @" value=""" +           // value="
    @"${text}" +             // text
    @"""/>";                 // "/>

string converted = Regex.Replace ("<msg>hello</msg>", regFind, regReplace);
Console.Write (converted);   // <msg value="hello"/>
MatchEvaluator delegate:
// A MatchEvaluator delegate computes the replacement text for each match;
// here every number found is multiplied by ten.
MatchEvaluator timesTen = match => (int.Parse (match.Value) * 10).ToString();

string scaled = Regex.Replace ("5 is less than 10", @"\d+", timesTen);
Console.WriteLine (scaled);    // 50 is less than 100
Splitting text:
// Split on each digit; the digits themselves are discarded.
string[] letters = Regex.Split ("a5b7c", @"\d");
foreach (string piece in letters)
{
    Console.Write (piece + " ");    // a b c
}

// Split on a zero-width lookahead: the input is cut BEFORE each capital
// letter and nothing is discarded.
string[] words = Regex.Split ("oneTwoThree", @"(?=[A-Z])");
foreach (string piece in words)
{
    Console.Write (piece + " ");    // one Two Three
}
Regular Expressions Cookbook
Matching U.S. Social Security number/phone number:
// Social Security number: 3 digits, 2 digits, 4 digits, separated by hyphens.
string ssNum = @"\d{3}-\d{2}-\d{4}";
bool ssnOk = Regex.IsMatch ("123-45-6789", ssNum);
Console.WriteLine (ssnOk);            // True

// Phone number. (?x) makes the engine ignore the whitespace inside the pattern.
// The area code is either "ddd" followed by a hyphen/space, or "(ddd)" with an
// optional space after it.
string phone = @"(?x) ( \d{3}[-\s] | \(\d{3}\)\s? ) \d{3}[-\s]? \d{4}";
bool hyphenated    = Regex.IsMatch ("123-456-7890", phone);
bool parenthesized = Regex.IsMatch ("(123) 456-7890", phone);
Console.WriteLine (hyphenated);       // True
Console.WriteLine (parenthesized);    // True
Extracting “name = value” pairs (one per line):
// Extract "name = value" pairs, one per line.
//   (?m)            '^' and '$' match at every line, not just the whole input
//   (?'name'\w+)    the key
//   (?'value'.*?)   the value; lazy so that the trailing \s* (and the optional
//                   "\r" of a Windows line ending) is trimmed off rather than
//                   captured as part of the value
string r = @"(?m)^\s*(?'name'\w+)\s*=\s*(?'value'.*?)\s*(?=\r?$)";

// The input must contain one pair per line; the verbatim string keeps the
// line breaks exactly as written here.
string text =
@"id = 3
secure = true
timeout = 30";

foreach (Match m in Regex.Matches (text, r))
    Console.WriteLine (m.Groups["name"] + " is " + m.Groups["value"]);

// Output:
//   id is 3
//   secure is true
//   timeout is 30
Strong password validation:
// A password passes when it has at least six characters AND contains at least
// one digit (\d), punctuation mark (\p{P}) or symbol (\p{S}).
// (?x) lets the pattern contain insignificant whitespace.
string r = @"(?x)^(?=.* ( \d | \p{P} | \p{S} )).{6,}";

bool tooShort    = Regex.IsMatch ("abc12", r);
bool lettersOnly = Regex.IsMatch ("abcdef", r);
bool acceptable  = Regex.IsMatch ("ab88yz", r);
Console.WriteLine (tooShort);       // False
Console.WriteLine (lettersOnly);    // False
Console.WriteLine (acceptable);     // True
Lines of at least 80 characters:
// Count the lines that are at least 80 characters long. (?m) applies '^' and
// '$' per line; the lookahead tolerates a "\r" before the line break.
string r = @"(?m)^.{80,}(?=\r?$)";

string fifty  = new string ('x', 50);
string eighty = new string ('x', 80);
string text   = eighty + "\r\n" + fifty + "\r\n" + eighty;

int longLineCount = Regex.Matches (text, r).Count;
Console.WriteLine (longLineCount);    // 2
Parsing dates/times (N/N/N H:M:S AM/PM):
// Parse a date/time of the form N/N/N H:M:S AM/PM into its components.
// (?x) ignores the whitespace in the pattern; (?i) makes AM/PM case-insensitive.
// Date parts may be separated by '.', '/' or '-'; date and time by whitespace
// or 'T'; the AM/PM designator is optional.
string r = @"(?x)(?i) (\d{1,4}) [./-] (\d{1,2}) [./-] (\d{1,4}) [\sT] (\d+):(\d+):(\d+) \s? (A\.?M\.?|P\.?M\.?)?";

string text = "01/02/2008 5:20:50 PM";

// Groups[0] is the whole match; the remaining groups are the captured parts.
foreach (Group g in Regex.Match (text, r).Groups)
    Console.WriteLine (g.Value + " ");

// Output (one group per line):
//   01/02/2008 5:20:50 PM
//   01
//   02
//   2008
//   5
//   20
//   50
//   PM
Matching Roman numerals:
// Roman numeral, built up one decimal place at a time.
// NOTE: every component is optional, so the pattern can also succeed with an
// empty match at a word boundary; IsMatch alone does not prove that the input
// actually contains a numeral.
string r =
    @"(?i)\bm*" +             // thousands
    @"(d?c{0,3}|c[dm])" +     // hundreds
    @"(l?x{0,3}|x[lc])" +     // tens
    @"(v?i{0,3}|i[vx])" +     // units
    @"\b";

bool isRoman = Regex.IsMatch ("MCMLXXXIV", r);
Console.WriteLine (isRoman);    // True
Removing repeated words:
// Collapse an immediately repeated word ("the the") into a single occurrence.
// The \b anchors make both occurrences WHOLE words; without them the pattern
// also fires on partial words, e.g. "the theme" -> "theme" or
// "bathe the" -> "bathe".
string r = @"\b(?'dupe'\w+)\W\k'dupe'\b";

string text = "In the the beginning...";
Console.WriteLine (Regex.Replace (text, r, "${dupe}"));    // In the beginning...
Word count:
// Count words, treating apostrophes and hyphens as part of a word so that
// "It's" and "mumbo-jumbo" each count once.
string r = @"\b(\w|[-'])+\b";
string text = "It's all mumbo-jumbo to me";

int wordCount = Regex.Matches (text, r).Count;
Console.WriteLine (wordCount);    // 5
Matching a Guid:
// A Guid is five hyphen-separated runs of hex digits: 8-4-4-4-12.
string r =
    @"(?i)\b" +
    @"[0-9a-fA-F]{8}\-" +
    @"[0-9a-fA-F]{4}\-" +
    @"[0-9a-fA-F]{4}\-" +
    @"[0-9a-fA-F]{4}\-" +
    @"[0-9a-fA-F]{12}" +
    @"\b";

string text = "Its key is {3F2504E0-4F89-11D3-9A0C-0305E82C3301}.";

// The match starts just after the opening brace.
int guidStart = Regex.Match (text, r).Index;
Console.WriteLine (guidStart);    // 12
Parsing an XML tag:
// Pull the tag name and the text content out of a simple XML/HTML element.
string r =
    @"<(?'tag'\w+?).*>" +    // match first tag, and name it 'tag'
    @"(?'text'.*?)" +        // match text content, name it 'text'
    @"</\k'tag'>";           // match last tag, denoted by 'tag'

string text = "<h1>hello</h1>";
Match element = Regex.Match (text, r);

Group tagName = element.Groups ["tag"];
Group content = element.Groups ["text"];
Console.WriteLine (tagName);    // h1
Console.WriteLine (content);    // hello
Splitting a camel-cased word:
// Split before every capital letter. The lookahead is zero-width, so the
// capital itself stays at the start of the next piece.
string r = @"(?=[A-Z])";

string[] words = Regex.Split ("oneTwoThree", r);
foreach (string word in words)
{
    Console.Write (word + " ");    // one Two Three
}
Obtaining a legal filename:
// Strip every character that is not allowed in a file name.
string input = "My \"good\" <recipes>.txt";

// Use the FILE-NAME set rather than the PATH set: the path set is narrower and
// does not reliably contain characters such as '"', '<' and '>'.
// The exact contents of the array are platform-dependent.
char[] invalidChars = System.IO.Path.GetInvalidFileNameChars();

// Escape the characters so they are safe to embed in a [...] character class.
string invalidString = Regex.Escape (new string (invalidChars));
string valid = Regex.Replace (input, "[" + invalidString + "]", "");

Console.WriteLine (valid);    // My good recipes.txt  (e.g. on Windows)
Escaping Unicode characters for HTML:
// Replace every character from U+0080 upward with its decimal HTML numeric
// character reference (&#NNN;).
string htmlFragment = "© 2007";

MatchEvaluator toNumericEntity =
    match => @"&#" + ((int)match.Value[0]).ToString() + ";";

string result = Regex.Replace (htmlFragment, @"[\u0080-\uFFFF]", toNumericEntity);
Console.WriteLine (result);    // &#169; 2007
© 2007, O'Reilly Media, Inc. All rights reserved