January, 2008

  • Eric White's Blog

    How to Extract Comments from Open XML Documents

    • 4 Comments

    This post is based on an interesting query: a user of Open XML wanted a general way to extract the comments from Open XML documents and save them in a common metadata server. This post contains a short example that iterates through all files in a directory, extracts all comments from them, and outputs XML containing those comments. The directory can contain all types of Open XML documents: WordprocessingML, SpreadsheetML, and PresentationML documents.

    This blog is inactive.
    New blog: EricWhite.com/blog

    Blog TOC
    The resulting XML will look something like this:

    <Root>
      <Comment Source="docx" Author="Eric White">
        <Text space="preserve">Comment 1</Text>
      </Comment>
      <Comment Source="docx" Author="Eric White">
        <Text space="preserve">Another comment in a word doc.</Text>
      </Comment>
      <Comment Source="xlsx" Author="Eric White">
        <Text space="preserve">Eric White:
    This is a comment in an Excel spreadsheet.</Text>
      </Comment>
      <Comment Source="xlsx" Author="Eric White">
        <Text space="preserve">Eric White:
    Another comment.</Text>
      </Comment>
      <Comment Source="pptx" Author="Eric White">
        <Text space="preserve">Another comment.</Text>
      </Comment>
      <Comment Source="pptx" Author="Eric White">
        <Text space="preserve">This is a PPT comment.</Text>
      </Comment>
    </Root>
     

    Following is the example, in its entirety. This example was interesting in that I originally wrote it using queries instead of the for loops in the extraction methods, and in this case, I think that the code is more readable using an imperative style of coding.

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.IO;
    using System.Xml;
    using System.Xml.Linq;
    using Microsoft.Office.DocumentFormat.OpenXml.Packaging;
    namespace LtxOpenXml
    {
        public static class Extensions
        {
            public static XDocument LoadXDocument(this OpenXmlPart part)
            {
                XDocument xdoc;
                using (StreamReader streamReader = new StreamReader(part.GetStream()))
                    xdoc = XDocument.Load(XmlReader.Create(streamReader));
                return xdoc;
            }
            public static string StringConcatenate(this IEnumerable<string> source)
            {
                StringBuilder sb = new StringBuilder();
                foreach (string s in source)
                    sb.Append(s);
                return sb.ToString();
            }
        }
        class Program
        {
            static IEnumerable<XElement> ExtractFromDocx(string filename)
            {
                using (WordprocessingDocument wordDoc = WordprocessingDocument.Open(filename, true))
                {
                    MainDocumentPart mainPart = wordDoc.MainDocumentPart;
                    CommentsPart commentsPart = mainPart.CommentsPart;
                    XDocument cDoc = commentsPart.LoadXDocument();
                    XNamespace w = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";
                    foreach (var c in cDoc.Root.Elements(w + "comment"))
                    {
                        yield return new XElement("Comment",
                            new XAttribute("Source", "docx"),
                            new XAttribute("Author", (string)c.Attribute(w + "author")),
                            new XElement("Text",
                                new XAttribute("space", "preserve"),
                                c.Descendants(w + "t").Select(t => (string)t).StringConcatenate())
                        );
                    }
                }
            }
            static IEnumerable<XElement> ExtractFromXlsx(string filename)
            {
                using (SpreadsheetDocument spreadDoc = SpreadsheetDocument.Open(filename, true))
                {
                    XNamespace s = "http://schemas.openxmlformats.org/spreadsheetml/2006/main";
                    foreach (var wsp in spreadDoc.WorkbookPart.WorksheetParts)
                    {
                        var xwp = wsp.WorksheetCommentsPart;
                        if (xwp != null)
                        {
                            XDocument xd = xwp.LoadXDocument();
                            var authorArray = xd
                                              .Root
                                              .Element(s + "authors")
                                              .Elements(s + "author")
                                              .Select(c => (string)c).ToArray();
                            foreach (var c in xd.Root.Element(s + "commentList").Elements(s + "comment"))
                            {
                                yield return new XElement("Comment",
                                    new XAttribute("Source", "xlsx"),
                                    new XAttribute("Author", authorArray[(int)c.Attribute("authorId")]),
                                    new XElement("Text",
                                        new XAttribute("space", "preserve"),
                                        c.Element(s + "text").Descendants(s + "t").Select(t => (string)t).StringConcatenate())
                                );
                            }
                        }
                    }
                }
            }
            static IEnumerable<XElement> ExtractFromPptx(string filename)
            {
                using (PresentationDocument pDoc = PresentationDocument.Open(filename, true))
                {
                    XNamespace p = "http://schemas.openxmlformats.org/presentationml/2006/main";
                    var cap = pDoc.PresentationPart.CommentAuthorsPart;
                    if (cap != null)
                    {
                        var capXDocument = cap.LoadXDocument();
                        foreach (var slide in pDoc.PresentationPart.SlideParts)
                        {
                            var cp = slide.SlideCommentsPart;
                            if (cp != null)
                            {
                                var cpXDocument = cp.LoadXDocument();
                                foreach (var c in cpXDocument.Root.Elements(p + "cm"))
                                {
                                    yield return new XElement("Comment",
                                        new XAttribute("Source", "pptx"),
                                        new XAttribute("Author", (string)capXDocument
                                                                 .Root
                                                                 .Elements(p + "cmAuthor")
                                                                 .Where(z => (string)z.Attribute("id") == (string)c.Attribute("authorId"))
                                                                 .FirstOrDefault().Attribute("name")
                                                                 ),
                                        new XElement("Text",
                                            new XAttribute("space", "preserve"),
                                            (string)c.Element(p + "text"))
                                    );
                                }
                            }
                        }
                    }
                }
            }
            static void Main(string[] args)
            {
                XElement root = new XElement("Root",
                    Directory.GetFiles(".", "*.docx").Select(f => ExtractFromDocx(f)),
                    Directory.GetFiles(".", "*.xlsx").Select(f => ExtractFromXlsx(f)),
                    Directory.GetFiles(".", "*.pptx").Select(f => ExtractFromPptx(f))
                );
                Console.WriteLine(root);
            }
        }
    }

Page 1 of 4 (4 items) 1234
Page 1 of 1 (4 items)