Blog - Title

Complete Listing of ParseWordML

Complete Listing of ParseWordML

  • Comments 0

[Blog Map]  [Table of Contents]  [Next Topic]

The following code is attached to this page.

This blog is inactive.
New blog: EricWhite.com/blog

Blog TOC
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Xml;
using System.Xml.Linq;
using DocumentFormat.OpenXml.Packaging;
 
/// <summary>
/// A group of source elements that share a single key, exposed through
/// <see cref="IGrouping{TKey, TSource}"/> so it can be consumed like the groups
/// produced by the standard LINQ grouping operators.
/// </summary>
/// <typeparam name="TSource">Type of the elements held by the group.</typeparam>
/// <typeparam name="TKey">Type of the key shared by the elements.</typeparam>
public class GroupOfAdjacent<TSource, TKey> :
    IEnumerable<TSource>, IGrouping<TKey, TSource>
{
    /// <summary>
    /// Creates a group over <paramref name="source"/>. The list is stored as-is
    /// (not copied) and is what enumeration walks.
    /// </summary>
    /// <param name="source">Elements belonging to the group.</param>
    /// <param name="key">Key associated with the group.</param>
    public GroupOfAdjacent(List<TSource> source, TKey key)
    {
        Key = key;
        GroupList = source;
    }

    /// <summary>Key associated with this group.</summary>
    public TKey Key { get; set; }

    // Backing list handed to the constructor.
    private List<TSource> GroupList { get; set; }

    /// <summary>Lazily yields the group's elements in list order.</summary>
    IEnumerator<TSource> IEnumerable<TSource>.GetEnumerator()
    {
        foreach (TSource element in GroupList)
            yield return element;
    }

    /// <summary>Non-generic enumeration; forwards to the generic enumerator.</summary>
    System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
    {
        IEnumerable<TSource> typedView = this;
        return typedView.GetEnumerator();
    }
}
 
/// <summary>
/// Extension helpers used by the sample: an element path formatter, string
/// concatenation over sequences, and an adjacency-based grouping operator.
/// </summary>
public static class LocalExtensions
{
    /// <summary>
    /// Builds a slash-separated path of local element names from the outermost
    /// ancestor down to <paramref name="el"/>.
    /// </summary>
    /// <param name="el">Element whose path is wanted.</param>
    /// <returns>
    /// The local names (namespaces are not included), each followed by "/",
    /// so the result always ends with a trailing slash.
    /// </returns>
    public static string GetPath(this XElement el)
    {
        // AncestorsAndSelf() walks from the element outwards, so each name is
        // prepended to what has been accumulated so far.
        return
            el
            .AncestorsAndSelf()
            .Aggregate("", (seed, i) => i.Name.LocalName + "/" + seed);
    }

    /// <summary>Concatenates all strings of <paramref name="source"/> in order.</summary>
    /// <param name="source">Strings to join; no separator is inserted.</param>
    /// <returns>The concatenation, or an empty string for an empty sequence.</returns>
    public static string StringConcatenate(
        this IEnumerable<string> source)
    {
        return source.Aggregate(
            new StringBuilder(),
            (s, i) => s.Append(i),
            s => s.ToString());
    }

    /// <summary>
    /// Projects every item of <paramref name="source"/> to a string and
    /// concatenates the results in order.
    /// </summary>
    /// <param name="source">Items to project.</param>
    /// <param name="projectionFunc">Maps an item to the text appended for it.</param>
    /// <returns>The concatenation, or an empty string for an empty sequence.</returns>
    public static string StringConcatenate<T>(
        this IEnumerable<T> source,
        Func<T, string> projectionFunc)
    {
        return source.Aggregate(
            new StringBuilder(),
            (s, i) => s.Append(projectionFunc(i)),
            s => s.ToString());
    }

    /// <summary>
    /// Groups consecutive elements that have equal keys. Unlike GroupBy, elements
    /// with the same key that are separated by a different key end up in
    /// different groups. Evaluation is deferred and streams over the source once.
    /// </summary>
    /// <param name="source">Sequence to split into runs.</param>
    /// <param name="keySelector">Computes the key of an element; may return null.</param>
    /// <returns>One group per run of adjacent elements with equal keys, in source order.</returns>
    public static IEnumerable<IGrouping<TKey, TSource>> GroupAdjacent<TSource, TKey>(
        this IEnumerable<TSource> source,
        Func<TSource, TKey> keySelector)
    {
        // The default comparer copes with null keys; calling k.Equals(last)
        // directly throws NullReferenceException as soon as a key is null.
        EqualityComparer<TKey> comparer = EqualityComparer<TKey>.Default;

        TKey last = default(TKey);
        bool haveLast = false;
        List<TSource> list = new List<TSource>();

        foreach (TSource s in source)
        {
            TKey k = keySelector(s);

            // A key change closes the current run and starts a new one.
            if (haveLast && !comparer.Equals(k, last))
            {
                yield return new GroupOfAdjacent<TSource, TKey>(list, last);
                list = new List<TSource>();
            }

            list.Add(s);
            last = k;
            haveLast = true;
        }

        // Emit the final run; an empty source yields no groups at all.
        if (haveLast)
            yield return new GroupOfAdjacent<TSource, TKey>(list, last);
    }
}
 
/// <summary>
/// Console sample: finds every run of adjacent paragraphs styled "Code" in a
/// WordprocessingML document and prints the run's text together with the text
/// of the comment whose range starts inside that run.
/// </summary>
class Program
{
    // WordprocessingML main namespace; element and attribute names are built as w + "name".
    readonly static XNamespace w =
      "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    /// <summary>Loads the XML content of a package part into an XDocument.</summary>
    /// <param name="part">Part to read.</param>
    /// <returns>The parsed XML of the part.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="part"/> is null.</exception>
    public static XDocument LoadXDocument(OpenXmlPart part)
    {
        if (part == null)
            throw new ArgumentNullException("part");

        // Hand the raw stream to the XML reader so it applies its own encoding
        // detection, and dispose both the reader and the stream.
        using (Stream partStream = part.GetStream())
        using (XmlReader xmlReader = XmlReader.Create(partStream))
            return XDocument.Load(xmlReader);
    }

    /// <summary>Returns the style id referenced by a paragraph.</summary>
    /// <param name="para">A w:p element.</param>
    /// <returns>
    /// The w:val of the first w:pPr/w:pStyle child, or null when the paragraph
    /// does not reference a style.
    /// </returns>
    public static string GetParagraphStyle(XElement para)
    {
        return (string)para.Elements(w + "pPr")
                           .Elements(w + "pStyle")
                           .Attributes(w + "val")
                           .FirstOrDefault();
    }

    /// <summary>Returns the text of the comment with the given id.</summary>
    /// <param name="commentsDoc">XML of the comments part; may be null.</param>
    /// <param name="id">Value of the comment's w:id attribute.</param>
    /// <returns>
    /// The text of each paragraph of the comment followed by "\n", or an empty
    /// string when there is no comments document, no id, or no matching comment.
    /// </returns>
    public static string GetCommentText(XDocument commentsDoc, string id)
    {
        if (commentsDoc == null || commentsDoc.Root == null || id == null)
            return string.Empty;

        XElement commentNode =
            commentsDoc.Root
                       .Elements(w + "comment")
                       .FirstOrDefault(c => (string)c.Attribute(w + "id") == id);

        // A dangling comment reference is reported as "no text" instead of throwing.
        if (commentNode == null)
            return string.Empty;

        return commentNode.Elements(w + "p")
                          .StringConcatenate(node =>
                              node.Descendants(w + "t")
                                  .Select(t => (string)t)
                                  .StringConcatenate()
                              + "\n");
    }

    /// <summary>
    /// Entry point. Processes the document named by the first command-line
    /// argument, or "SampleDoc.docx" when no argument is given.
    /// </summary>
    static void Main(string[] args)
    {
        string filename =
            (args != null && args.Length > 0) ? args[0] : "SampleDoc.docx";

        // The document is only read, so open it non-editable.
        using (WordprocessingDocument wordDoc =
            WordprocessingDocument.Open(filename, false))
        {
            MainDocumentPart mainPart = wordDoc.MainDocumentPart;
            XDocument mainPartDoc = LoadXDocument(mainPart);

            // Styles and comments may be missing from the package; treat each
            // as absent rather than failing.
            StyleDefinitionsPart stylePart = mainPart.StyleDefinitionsPart;
            CommentsPart commentsPart = mainPart.CommentsPart;
            XDocument commentsDoc =
                commentsPart == null ? null : LoadXDocument(commentsPart);

            // Style id of the default paragraph style, or null when none is declared.
            string defaultStyle = null;
            if (stylePart != null)
            {
                defaultStyle =
                    LoadXDocument(stylePart).Root
                        .Elements(w + "style")
                        .Where(style =>
                            (string)style.Attribute(w + "type") == "paragraph" &&
                            (string)style.Attribute(w + "default") == "1")
                        .Select(style => (string)style.Attribute(w + "styleId"))
                        .FirstOrDefault();
            }

            var paragraphs =
                mainPartDoc.Root
                           .Element(w + "body")
                           .Descendants(w + "p")
                .Select(p =>
                    new
                    {
                        ParagraphNode = p,
                        // Explicit style, else the default style, else "" so the
                        // grouping key used below is never null.
                        Style = GetParagraphStyle(p) ?? defaultStyle ?? string.Empty
                    }
                );

            XName r = w + "r";
            XName ins = w + "ins";

            // Paragraph text is taken from w:t descendants of direct w:r and w:ins children.
            var paragraphsWithText =
                paragraphs.Select(p =>
                    new
                    {
                        ParagraphNode = p.ParagraphNode,
                        Style = p.Style,
                        Text = p.ParagraphNode
                                .Elements()
                                .Where(z => z.Name == r || z.Name == ins)
                                .Descendants(w + "t")
                                .StringConcatenate(s => (string)s)
                    }
                );

            var groupedCodeParagraphs =
                paragraphsWithText.GroupAdjacent(p => p.Style)
                                  .Where(g => g.Key == "Code");

            var groupedCodeWithComments =
                groupedCodeParagraphs.Select(g =>
                    {
                        // Id of the first comment range that starts directly
                        // inside one of the group's paragraphs; null if none.
                        string id =
                            g.Select(p => p.ParagraphNode)
                             .Elements(w + "commentRangeStart")
                             .Select(start => (string)start.Attribute(w + "id"))
                             .FirstOrDefault();
                        return new
                        {
                            ParagraphGroup = g,
                            Comment = id == null
                                ? string.Empty
                                : GetCommentText(commentsDoc, id)
                        };
                    }
                );

            foreach (var group in groupedCodeWithComments)
            {
                Console.WriteLine("Code Block");
                Console.WriteLine("==========");
                foreach (var paragraph in group.ParagraphGroup)
                    Console.WriteLine(paragraph.Text);
                Console.WriteLine();
                Console.WriteLine("Meta Data");
                Console.WriteLine("=========");
                Console.WriteLine(group.Comment);
                Console.WriteLine();
            }
        }
    }
}
 

[Blog Map]  [Table of Contents]  [Next Topic]

Attachment: ParseWordML.cs
Leave a Comment
  • Please add 2 and 1 and type the answer here:
  • Post