Blog Map
[Blog Map] This blog is inactive. New blog: EricWhite.com/blog
In two previous posts, I developed a somewhat involved query to search through a word processing document for style names and/or paragraph content. This is a query that I’m developing for the PowerTools for Open XML project. In those posts, as I evolved the query, I showed each iteration of it, highlighting the changes I made. This post continues modifying that query:
Hey, if anyone else has any additional suggestions for this query, I’m interested! :)
Here is the evolved query:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Xml;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;

/// <summary>
/// Extension methods used by the document search query.
/// </summary>
public static class LocalExtensions
{
    /// <summary>
    /// Projects every item of <paramref name="source"/> to a string and
    /// concatenates the results with nothing in between.
    /// </summary>
    /// <typeparam name="T">Type of the items in the sequence.</typeparam>
    /// <param name="source">Items to concatenate.</param>
    /// <param name="func">Projection from an item to its text.</param>
    /// <returns>The concatenated text; empty when the sequence is empty.</returns>
    public static string StringConcatenate<T>(this IEnumerable<T> source, Func<T, string> func)
    {
        StringBuilder sb = new StringBuilder();
        foreach (T item in source)
        {
            sb.Append(func(item));
        }
        return sb.ToString();
    }

    /// <summary>
    /// Projects every item of <paramref name="source"/> to a string and joins
    /// the results, placing <paramref name="separator"/> between consecutive items.
    /// </summary>
    /// <typeparam name="T">Type of the items in the sequence.</typeparam>
    /// <param name="source">Items to concatenate.</param>
    /// <param name="func">Projection from an item to its text.</param>
    /// <param name="separator">Text inserted between items (never after the last one).</param>
    /// <returns>The joined text; empty when the sequence is empty.</returns>
    public static string StringConcatenate<T>(this IEnumerable<T> source, Func<T, string> func, string separator)
    {
        StringBuilder sb = new StringBuilder();
        bool isFirst = true;
        foreach (T item in source)
        {
            // Write the separator *before* every item except the first. Appending
            // it after each item and trimming by length afterwards left a stray
            // separator when the only item projected to an empty string.
            if (!isFirst)
            {
                sb.Append(separator);
            }
            sb.Append(func(item));
            isFirst = false;
        }
        return sb.ToString();
    }

    /// <summary>
    /// Returns the XML content of a package part as an <see cref="XDocument"/>.
    /// The document is attached to the part as an annotation on first load, so
    /// later calls for the same part return the same instance.
    /// </summary>
    /// <param name="part">The part whose XML should be loaded.</param>
    /// <returns>The annotated or newly loaded document.</returns>
    public static XDocument GetXDocument(this OpenXmlPart part)
    {
        XDocument cached = part.Annotation<XDocument>();
        if (cached != null)
        {
            return cached;
        }

        XDocument loaded;
        using (StreamReader sr = new StreamReader(part.GetStream()))
        using (XmlReader xr = XmlReader.Create(sr))
        {
            loaded = XDocument.Load(xr);
        }

        part.AddAnnotation(loaded);
        return loaded;
    }
}

/// <summary>
/// Qualified names in the WordprocessingML main namespace used by the query.
/// </summary>
public static class W
{
    // The namespace must be declared first: static field initializers run in
    // textual order and every name below is built from it.
    public static readonly XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    public static readonly XName style = w + "style";
    public static readonly XName type = w + "type";
    public static readonly XName styleId = w + "styleId";
    public static readonly XName name = w + "name";
    public static readonly XName val = w + "val";
    public static readonly XName basedOn = w + "basedOn";
    public static readonly XName r = w + "r";
    public static readonly XName ins = w + "ins";

    // "default" is not a valid identifier, so must use _default
    public static readonly XName _default = w + "default";

    public static readonly XName body = w + "body";
    public static readonly XName pPr = w + "pPr";
    public static readonly XName pStyle = w + "pStyle";
    public static readonly XName p = w + "p";
    public static readonly XName t = w + "t";
}

class Program
{
    /// <summary>
    /// Returns true when at least one of <paramref name="searchStrings"/> is
    /// present in <paramref name="stylesToSearch"/>.
    /// </summary>
    static bool ContainsAnyStyles(IEnumerable<string> stylesToSearch, IEnumerable<string> searchStrings)
    {
        return stylesToSearch.Intersect(searchStrings).Any();
    }

    /// <summary>
    /// Returns true when <paramref name="stringToSearch"/> matches any of the
    /// search criteria.
    /// </summary>
    /// <param name="stringToSearch">Text to examine.</param>
    /// <param name="searchStrings">Literal substrings; used when <paramref name="isRegularExpression"/> is false.</param>
    /// <param name="regularExpressions">Pre-built expressions; used when <paramref name="isRegularExpression"/> is true.</param>
    /// <param name="isRegularExpression">Selects regular-expression matching instead of substring matching.</param>
    /// <param name="caseInsensitive">
    /// For substring matching, ignores case. For regular expressions the
    /// supplied <see cref="Regex"/> instances decide case handling themselves.
    /// </param>
    static bool ContainsAnyContent(string stringToSearch, IEnumerable<string> searchStrings, IEnumerable<Regex> regularExpressions, bool isRegularExpression, bool caseInsensitive)
    {
        if (isRegularExpression)
        {
            return regularExpressions.Any(r => r.IsMatch(stringToSearch));
        }

        // An ordinal comparison replaces the former ToLower() round trip: it does
        // not depend on the current culture, does not allocate a lowered copy of
        // the text per search string, and no longer requires callers to lower-case
        // the search strings beforehand (already lowered strings still match).
        StringComparison comparison = caseInsensitive
            ? StringComparison.OrdinalIgnoreCase
            : StringComparison.Ordinal;
        return searchStrings.Any(s => stringToSearch.IndexOf(s, comparison) >= 0);
    }

    /// <summary>
    /// Returns the root element of the style definitions part, or null when
    /// the main document part has no style definitions part.
    /// </summary>
    static XElement GetStylesRoot(WordprocessingDocument doc)
    {
        var stylesPart = doc.MainDocumentPart.StyleDefinitionsPart;
        if (stylesPart == null)
        {
            return null;
        }
        return stylesPart.GetXDocument().Root;
    }

    /// <summary>
    /// Finds the first paragraph style with the given style id, or null.
    /// </summary>
    static XElement FindParagraphStyle(XElement stylesRoot, string styleId)
    {
        return stylesRoot
            .Elements(W.style)
            .FirstOrDefault(e =>
                (string)e.Attribute(W.type) == "paragraph" &&
                (string)e.Attribute(W.styleId) == styleId);
    }

    /// <summary>
    /// Returns the value of the style's w:name child, or null when absent.
    /// </summary>
    static string GetStyleDisplayName(XElement style)
    {
        return (string)style
            .Elements(W.name)
            .Attributes(W.val)
            .FirstOrDefault();
    }

    /// <summary>
    /// Returns the style id of the first paragraph style whose w:default
    /// attribute is "1", or null when there is no such style (or no styles part).
    /// </summary>
    static string GetDefaultParagraphStyleId(WordprocessingDocument doc)
    {
        XElement stylesRoot = GetStylesRoot(doc);
        if (stylesRoot == null)
        {
            return null;
        }

        XElement defaultStyle = stylesRoot
            .Elements(W.style)
            .FirstOrDefault(style =>
                (string)style.Attribute(W.type) == "paragraph" &&
                (string)style.Attribute(W._default) == "1");

        return defaultStyle != null ? (string)defaultStyle.Attribute(W.styleId) : null;
    }

    /// <summary>
    /// Enumerates the given style id, that style's name, and then the id and
    /// name of every paragraph style reached by following w:basedOn links.
    /// </summary>
    /// <param name="doc">Document whose style definitions are consulted.</param>
    /// <param name="styleId">Style id to start from; always yielded first, even if unknown.</param>
    /// <returns>
    /// A lazily evaluated sequence of ids and names. It ends at the first style
    /// that is missing, has no w:basedOn, or has already been visited.
    /// </returns>
    static IEnumerable<string> GetAllStyleIdsAndNames(WordprocessingDocument doc, string styleId)
    {
        yield return styleId;

        // A null id means "no style"; do not let it match a style element that
        // merely lacks a styleId attribute.
        if (styleId == null)
        {
            yield break;
        }

        // Fetch the styles root once instead of on every step of the chain.
        XElement stylesRoot = GetStylesRoot(doc);
        if (stylesRoot == null)
        {
            yield break;
        }

        XElement current = FindParagraphStyle(stylesRoot, styleId);
        if (current == null)
        {
            yield break;
        }

        string firstName = GetStyleDisplayName(current);
        if (firstName != null)
        {
            yield return firstName;
        }

        // Remember visited ids so a cyclic basedOn chain cannot loop forever.
        HashSet<string> visited = new HashSet<string>();
        visited.Add(styleId);

        while (true)
        {
            string basedOn = (string)current
                .Elements(W.basedOn)
                .Attributes(W.val)
                .FirstOrDefault();
            if (basedOn == null || !visited.Add(basedOn))
            {
                yield break;
            }
            yield return basedOn;

            // The referenced base style may not exist in the styles part; stop
            // here instead of dereferencing null.
            current = FindParagraphStyle(stylesRoot, basedOn);
            if (current == null)
            {
                yield break;
            }

            string basedOnName = GetStyleDisplayName(current);
            if (basedOnName != null)
            {
                yield return basedOnName;
            }
        }
    }

    /// <summary>
    /// Concatenates the w:t text found under the w:r and w:ins children of a paragraph.
    /// </summary>
    static string GetParagraphText(XElement paragraph)
    {
        return paragraph
            .Elements()
            .Where(z => z.Name == W.r || z.Name == W.ins)
            .Descendants(W.t)
            .StringConcatenate(element => (string)element);
    }

    /// <summary>
    /// Returns the text of a body-level element: the paragraph's own text for a
    /// w:p, otherwise the text of every descendant paragraph joined with
    /// <see cref="Environment.NewLine"/>.
    /// </summary>
    static string GetElementText(XElement element)
    {
        if (element.Name == W.p)
        {
            return GetParagraphText(element);
        }
        return element
            .Descendants(W.p)
            .StringConcatenate(p => GetParagraphText(p), Environment.NewLine);
    }

    /// <summary>
    /// Returns the value of the element's w:pPr/w:pStyle, or
    /// <paramref name="defaultStyleName"/> when the element has no w:pStyle.
    /// </summary>
    static string GetStyleNameOrDefault(XElement element, string defaultStyleName)
    {
        XElement styleNode = element
            .Elements(W.pPr)
            .Elements(W.pStyle)
            .FirstOrDefault();
        return styleNode != null ? (string)styleNode.Attribute(W.val) : defaultStyleName;
    }

    /// <summary>
    /// Searches the child elements of the document body and returns the
    /// zero-based indexes of those that satisfy every supplied criterion.
    /// </summary>
    /// <param name="doc">An open word processing document.</param>
    /// <param name="styleSearchString">
    /// Style ids or names; an element matches when its style, or any style it is
    /// based on, is among them. Null disables the style criterion.
    /// </param>
    /// <param name="contentSearchString">
    /// Substrings or regular-expression patterns; an element matches when its
    /// text matches any of them. Null disables the content criterion.
    /// </param>
    /// <param name="isRegularExpression">Treats the content strings as regular-expression patterns.</param>
    /// <param name="caseInsensitive">Ignores case when matching content.</param>
    /// <returns>
    /// Matching indexes in ascending order, or null when both criteria are null.
    /// </returns>
    static int[] SearchInDocument(WordprocessingDocument doc, IEnumerable<string> styleSearchString, IEnumerable<string> contentSearchString, bool isRegularExpression, bool caseInsensitive)
    {
        // With no criterion at all there is nothing to intersect; null is kept
        // as the result for that case so existing callers see no difference.
        if (styleSearchString == null && contentSearchString == null)
        {
            return null;
        }

        // Materialize the inputs once; they are consulted for every body element.
        string[] stylesToFind = styleSearchString != null ? styleSearchString.ToArray() : null;
        string[] contentToFind = null;
        Regex[] regularExpressions = null;
        if (contentSearchString != null)
        {
            contentToFind = contentSearchString.ToArray();
            if (isRegularExpression)
            {
                RegexOptions options = RegexOptions.Compiled;
                if (caseInsensitive)
                {
                    options |= RegexOptions.IgnoreCase;
                }
                regularExpressions = contentToFind
                    .Select(s => new Regex(s, options))
                    .ToArray();
            }
        }

        // The default style only matters for the style criterion. A document
        // without a default paragraph style yields null rather than an exception.
        string defaultStyleName = stylesToFind != null ? GetDefaultParagraphStyleId(doc) : null;

        IEnumerable<XElement> bodyElements = doc
            .MainDocumentPart
            .GetXDocument()
            .Root
            .Element(W.body)
            .Elements();

        // Single pass: each element's styles and text are computed at most once,
        // and the text is skipped entirely when the style criterion already failed.
        List<int> matches = new List<int>();
        int index = 0;
        foreach (XElement element in bodyElements)
        {
            bool isMatch = true;

            if (stylesToFind != null)
            {
                string styleName = GetStyleNameOrDefault(element, defaultStyleName);
                isMatch = ContainsAnyStyles(
                    GetAllStyleIdsAndNames(doc, styleName).Distinct(),
                    stylesToFind);
            }

            if (isMatch && contentToFind != null)
            {
                isMatch = ContainsAnyContent(
                    GetElementText(element),
                    contentToFind,
                    regularExpressions,
                    isRegularExpression,
                    caseInsensitive);
            }

            if (isMatch)
            {
                matches.Add(index);
            }
            index++;
        }

        return matches.ToArray();
    }

    /// <summary>
    /// Opens <paramref name="filename"/> read-only and searches it; see the
    /// overload taking a <see cref="WordprocessingDocument"/> for the criteria.
    /// </summary>
    static int[] SearchInDocument(string filename, IEnumerable<string> styleSearchString, IEnumerable<string> contentSearchString, bool isRegularExpression, bool caseInsensitive)
    {
        using (WordprocessingDocument doc = WordprocessingDocument.Open(filename, false))
        {
            return SearchInDocument(doc, styleSearchString, contentSearchString, isRegularExpression, caseInsensitive);
        }
    }

    /// <summary>
    /// Convenience overload for a single style and/or a single content string;
    /// a null argument disables that criterion.
    /// </summary>
    static int[] SearchInDocument(string filename, string styleSearchString, string contentSearchString, bool isRegularExpression, bool caseInsensitive)
    {
        IEnumerable<string> styles = null;
        if (styleSearchString != null)
        {
            styles = new List<string>() { styleSearchString };
        }

        IEnumerable<string> content = null;
        if (contentSearchString != null)
        {
            content = new List<string>() { contentSearchString };
        }

        return SearchInDocument(filename, styles, content, isRegularExpression, caseInsensitive);
    }

    /// <summary>
    /// Prints each result index, then "Passed" or "Failed" depending on whether
    /// the results equal <paramref name="expected"/>, then a blank line.
    /// </summary>
    static void ReportResults(int[] results, int[] expected)
    {
        foreach (var i in results)
        {
            Console.WriteLine(i);
        }
        Console.WriteLine(results.SequenceEqual(expected) ? "Passed" : "Failed");
        Console.WriteLine();
    }

    /// <summary>
    /// Runs the seven sample searches against Test.docx and reports each outcome.
    /// </summary>
    static void Main(string[] args)
    {
        // The title is written before the search runs so that it is already on
        // the console if the search throws.
        Console.WriteLine("Test 1");
        ReportResults(
            SearchInDocument("Test.docx", new[] { "Normal" }, new[] { "h.*o", "aaa" }, true, false),
            new[] { 7, 10 });

        Console.WriteLine("Test 2");
        ReportResults(
            SearchInDocument("Test.docx", new[] { "NotAStyle" }, new[] { "h.*o", "aaa" }, true, false),
            new int[] { });

        Console.WriteLine("Test 3");
        ReportResults(
            SearchInDocument("Test.docx", new[] { "Heading1" }, null, true, false),
            new int[] { 0 });

        Console.WriteLine("Test 4");
        ReportResults(
            SearchInDocument("Test.docx", new[] { "Normal" }, new[] { "h.*o", "aaa" }, true, true),
            new int[] { 0, 6, 7, 8, 10 });

        Console.WriteLine("Test 5");
        ReportResults(
            SearchInDocument("Test.docx", null, new[] { "hello", "aaa" }, false, false),
            new int[] { 7, 10 });

        Console.WriteLine("Test 6");
        ReportResults(
            SearchInDocument("Test.docx", null, new[] { "hello", "aaa" }, false, true),
            new int[] { 0, 6, 7, 8, 10 });

        Console.WriteLine("Test 7");
        ReportResults(
            SearchInDocument("Test.docx", "Heading1", "Aaa", false, false),
            new int[] { 0 });
    }
}
Code is attached.
"...did a minor refactoring to pre-convert search strings to lower case when the search is case insensitive..."
For best practice, shouldn't that be "upper case"?
http://msdn.microsoft.com/en-us/library/bb386042.aspx
"Strings should be normalized to uppercase. There is a small group of characters that when converted to lowercase cannot make a round trip."
Great point, John. You're right, upper case is better.
-Eric