Several years back I posted an XML tokenizer for syntax highlighting. At the time I didn't post a complete app, since it was part of a larger project; but someone recently asked me for a little more context for the tokenizer, so I put together a quickie Silverlight project showing how to use the tokenizer for syntax highlighting:
<!-- Demo page layout: a two-row grid with an editable TextBox ("box") on top
     and a TextBlock ("block") underneath. -->
<UserControl x:Class="XmlNotepad.MainPage"
             xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
             xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
             xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
             xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
             mc:Ignorable="d"
             d:DesignWidth="640"
             d:DesignHeight="480">
    <Grid x:Name="LayoutRoot" Background="AliceBlue">
        <Grid.RowDefinitions>
            <RowDefinition Height="*"/>
            <RowDefinition Height="*"/>
        </Grid.RowDefinitions>
        <!-- Row 0: multi-line input -->
        <TextBox x:Name="box" AcceptsReturn="True" Grid.Row="0"/>
        <!-- Row 1: output -->
        <TextBlock x:Name="block" Grid.Row="1"/>
    </Grid>
</UserControl>
using System;using System.Collections.Generic;using System.Linq;using System.Net;using System.Windows;using System.Windows.Controls;using System.Windows.Documents;using System.Windows.Input;using System.Windows.Media;using System.Windows.Media.Animation;using System.Windows.Shapes;
namespace XmlNotepad
{
    /// <summary>
    /// Code-behind for the demo page. Every time the text box changes, its
    /// contents are tokenized and re-rendered into the text block as one
    /// colored <see cref="Run"/> per token.
    /// </summary>
    public partial class MainPage : UserControl
    {
        /// <summary>Builds the page and subscribes to text-box changes.</summary>
        public MainPage()
        {
            InitializeComponent();
            box.TextChanged += box_TextChanged;
        }

        /// <summary>Maps a token to the color it is drawn with.</summary>
        /// <param name="token">Token whose <c>Kind</c> selects the color.</param>
        /// <param name="tokenText">Text covered by the token (not consulted).</param>
        /// <returns>
        /// Blue for punctuation and attribute values, brown for element names,
        /// red for attribute names and entities, green for comment text, and
        /// black for everything else.
        /// </returns>
        private static Color ColorForToken(XmlToken token, string tokenText)
        {
            switch (token.Kind)
            {
                case XmlTokenKind.ElementName:
                    return Colors.Brown;

                case XmlTokenKind.AttributeName:
                case XmlTokenKind.Entity:
                    return Colors.Red;

                case XmlTokenKind.CommentText:
                    return Colors.Green;

                case XmlTokenKind.Open:
                case XmlTokenKind.OpenClose:
                case XmlTokenKind.Close:
                case XmlTokenKind.SelfClose:
                case XmlTokenKind.CommentBegin:
                case XmlTokenKind.CommentEnd:
                case XmlTokenKind.CDataBegin:
                case XmlTokenKind.CDataEnd:
                case XmlTokenKind.Equals:
                case XmlTokenKind.OpenProcessingInstruction:
                case XmlTokenKind.CloseProcessingInstruction:
                case XmlTokenKind.AttributeValue:
                    return Colors.Blue;

                default:
                    // TextContent and every kind not listed above.
                    return Colors.Black;
            }
        }

        /// <summary>
        /// Re-tokenizes the whole text box and rebuilds the inlines of the
        /// text block, one run per token.
        /// </summary>
        void box_TextChanged(object sender, TextChangedEventArgs e)
        {
            block.Inlines.Clear();

            string xml = box.Text;
            XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;
            List<XmlToken> tokens = new XmlTokenizer().Tokenize(xml, ref mode);

            // Tokens carry only a length, so the running offset recovers each
            // token's text from the original string.
            int offset = 0;
            foreach (XmlToken token in tokens)
            {
                string text = xml.Substring(offset, token.Length);
                offset += token.Length;

                Run run = new Run();
                run.Foreground = new SolidColorBrush(ColorForToken(token, text));
                run.Text = text;
                block.Inlines.Add(run);
            }
        }
    }
}
using System;using System.Collections.Generic;using System.Text;using System.Diagnostics;
namespace XmlNotepad
{
    /*
     * This file implements a mostly correct XML tokenizer. The token boundaries
     * have been chosen to match Visual Studio syntax highlighting, so a few of
     * the boundaries are a little weird (especially comments).
     *
     * Known issues:
     *   - Doesn't handle DTDs.
     *   - Mediocre handling of processing instructions <? ?>: it won't crash,
     *     but the token boundaries are wrong.
     *   - Doesn't enforce correct XML; malformed input is tokenized on a
     *     best-effort basis.
     *   - An attribute value left open at the end of the input is not resumed
     *     by the next Tokenize call (there is no mode for it).
     *   - A token never covers more than short.MaxValue characters; longer
     *     runs are split into several tokens. For names and attribute values
     *     the kinds of the pieces after the first may be wrong.
     *
     * This tokenizer has been designed to be restartable, so you can tokenize
     * one line of XML at a time.
     */

    /// <summary>Kinds of token produced by <see cref="XmlTokenizer"/>.</summary>
    enum XmlTokenKind : short
    {
        Open,                       // <
        Close,                      // >
        SelfClose,                  // />
        OpenClose,                  // </
        ElementName,
        ElementWhitespace,          // whitespace between attributes
        AttributeName,
        Equals,                     // '=' inside an element
        AttributeValue,             // attribute value, including its quotes
        CommentBegin,               // <!--
        CommentText,
        CommentEnd,                 // -->
        Entity,                     // e.g. &gt;
        OpenProcessingInstruction,  // <?
        CloseProcessingInstruction, // ?>
        CDataBegin,                 // <![CDATA[
        CDataEnd,                   // ]]>
        TextContent,                // text between tags (whitespace included); also used for
                                    // CDATA / processing-instruction bodies and stray characters
        EOF,                        // end of input
    }

    /// <summary>
    /// Tokenizer state. Passed in and out of
    /// <see cref="XmlTokenizer.Tokenize(string, ref XmlTokenizerMode)"/> so the
    /// tokenizer can be restarted for the next line of XML.
    /// </summary>
    enum XmlTokenizerMode
    {
        InsideComment,
        InsideProcessingInstruction,
        AfterOpen,
        AfterAttributeName,
        AfterAttributeEquals,
        InsideElement, // after element name, before attribute or />
        OutsideElement,
        InsideCData,
    }

    /// <summary>
    /// A token: its kind and the number of characters it covers. Tokens do not
    /// store their start offset; consumers recover it by summing the lengths of
    /// the preceding tokens.
    /// </summary>
    struct XmlToken
    {
        public XmlTokenKind Kind;
        public short Length;

        /// <param name="kind">Kind of the token.</param>
        /// <param name="length">
        /// Number of characters covered. Stored as a <c>short</c>, so callers
        /// must not pass more than <c>short.MaxValue</c>.
        /// </param>
        public XmlToken(XmlTokenKind kind, int length)
        {
            Kind = kind;
            Length = (short)length;
        }
    }

    /// <summary>
    /// XML tokenizer; tokens are designed to match Visual Studio syntax highlighting.
    /// </summary>
    class XmlTokenizer
    {
        // Characters that end a run of plain text outside an element.
        private static readonly char[] TextTerminators = { '<', '&' };

        string input;
        int position = 0;
        XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;

        /// <summary>
        /// Tokenizes <paramref name="input"/> as a complete document, starting
        /// outside any element.
        /// </summary>
        /// <returns>The tokens in input order; the last one is always EOF.</returns>
        public static List<XmlToken> Tokenize(string input)
        {
            XmlTokenizerMode mode = XmlTokenizerMode.OutsideElement;
            XmlTokenizer tokenizer = new XmlTokenizer();
            return tokenizer.Tokenize(input, ref mode);
        }

        /// <summary>
        /// Tokenizes <paramref name="input"/> starting in the given mode.
        /// </summary>
        /// <param name="input">Text to tokenize (e.g. one line of a document).</param>
        /// <param name="_mode">
        /// On entry, the mode to start in; on exit, the mode the tokenizer
        /// ended in, to be passed to the call for the next piece of text.
        /// </param>
        /// <returns>The tokens in input order; the last one is always EOF.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
        public List<XmlToken> Tokenize(string input, ref XmlTokenizerMode _mode)
        {
            if (input == null)
            {
                throw new ArgumentNullException("input");
            }

            this.input = input;
            this.mode = _mode;
            this.position = 0;
            List<XmlToken> result = Tokenize();
            _mode = this.mode;
            return result;
        }

        // Pulls tokens until EOF; the EOF token is included in the result.
        private List<XmlToken> Tokenize()
        {
            List<XmlToken> list = new List<XmlToken>();
            XmlToken token;
            do
            {
                token = NextToken();
                list.Add(token);
            } while (token.Kind != XmlTokenKind.EOF);

            return list;
        }

        // Debugging helper: the text covered by each token, in order.
        private List<string> TokensToStrings(List<XmlToken> list, string input)
        {
            List<string> output = new List<string>(list.Count);
            int position = 0;
            foreach (XmlToken token in list)
            {
                output.Add(input.Substring(position, token.Length));
                position += token.Length;
            }
            return output;
        }

        // debugging function
        public string RemainingText
        {
            get { return input.Substring(position); }
        }

        // Dispatches on the current mode. Every branch except a zero-length
        // element name consumes at least one character, and that case moves
        // to a mode that does, so the token loop always terminates.
        private XmlToken NextToken()
        {
            if (position >= input.Length)
            {
                return new XmlToken(XmlTokenKind.EOF, 0);
            }

            switch (mode)
            {
                case XmlTokenizerMode.AfterAttributeEquals:
                    return TokenizeAfterAttributeEquals();
                case XmlTokenizerMode.AfterAttributeName:
                    return TokenizeAfterAttributeName();
                case XmlTokenizerMode.AfterOpen:
                    return TokenizeName(XmlTokenKind.ElementName, XmlTokenizerMode.InsideElement);
                case XmlTokenizerMode.InsideCData:
                    return TokenizeInsideCData();
                case XmlTokenizerMode.InsideComment:
                    return TokenizeInsideComment();
                case XmlTokenizerMode.InsideElement:
                    return TokenizeInsideElement();
                case XmlTokenizerMode.InsideProcessingInstruction:
                    return TokenizeInsideProcessingInstruction();
                case XmlTokenizerMode.OutsideElement:
                    return TokenizeOutsideElement();
                default:
                    throw new InvalidOperationException("missing case");
            }
        }

        private bool IsNameCharacter(char character)
        {
            // XML rule: Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
            // (CombiningChar and Extender are not checked here.)
            return char.IsLetterOrDigit(character)
                || character == '.'
                || character == '-'
                || character == '_'
                || character == ':';
        }

        // Emits a token of the given kind covering [position, end) and advances
        // past it. The length is capped at short.MaxValue because XmlToken
        // stores it in a short; the remainder is left for the next call.
        private XmlToken TakeUntil(int end, XmlTokenKind kind)
        {
            int length = Math.Min(end - position, short.MaxValue);
            position += length;
            return new XmlToken(kind, length);
        }

        // After an attribute name: whitespace, then '='. Anything else means
        // the attribute has no value, so resume normal in-element scanning.
        private XmlToken TokenizeAfterAttributeName()
        {
            char current = input[position];
            if (char.IsWhiteSpace(current))
            {
                return TokenizeElementWhitespace();
            }
            if (current == '=')
            {
                return TokenizeSimple("=", XmlTokenKind.Equals, XmlTokenizerMode.AfterAttributeEquals);
            }

            mode = XmlTokenizerMode.InsideElement;
            return TokenizeInsideElement();
        }

        // After '=': whitespace, then a quoted value. Anything else means the
        // value is missing, so resume normal in-element scanning.
        private XmlToken TokenizeAfterAttributeEquals()
        {
            char current = input[position];
            if (char.IsWhiteSpace(current))
            {
                return TokenizeElementWhitespace();
            }
            if (current == '"' || current == '\'')
            {
                return TokenizeAttributeValue();
            }

            mode = XmlTokenizerMode.InsideElement;
            return TokenizeInsideElement();
        }

        // Tokenizes a quoted attribute value starting at its opening quote.
        // The token runs through the matching closing quote, or to the end of
        // the input when the value is unterminated.
        private XmlToken TokenizeAttributeValue()
        {
            char quote = input[position];
            int closePosition = input.IndexOf(quote, position + 1);
            int end = closePosition < 0 ? input.Length : closePosition + 1;

            XmlToken token = TakeUntil(end, XmlTokenKind.AttributeValue);
            mode = XmlTokenizerMode.InsideElement;
            return token;
        }

        // Tokenizes a (possibly empty) run of name characters and switches to nextMode.
        private XmlToken TokenizeName(XmlTokenKind kind, XmlTokenizerMode nextMode)
        {
            Debug.Assert(mode == XmlTokenizerMode.AfterOpen || mode == XmlTokenizerMode.InsideElement);

            int end = position;
            while (end < input.Length && IsNameCharacter(input[end]))
            {
                end++;
            }

            XmlToken token = TakeUntil(end, kind);
            mode = nextMode;
            return token;
        }

        // Tokenizes a run of whitespace inside an element; the mode is unchanged.
        private XmlToken TokenizeElementWhitespace()
        {
            int end = position;
            while (end < input.Length && char.IsWhiteSpace(input[end]))
            {
                end++;
            }
            return TakeUntil(end, XmlTokenKind.ElementWhitespace);
        }

        // True when the input at the current position begins with text (ordinal comparison).
        private bool StartsWith(string text)
        {
            if (position + text.Length > input.Length)
            {
                return false;
            }
            return string.CompareOrdinal(input, position, text, 0, text.Length) == 0;
        }

        // Tokenizes whatever comes next between the element name and the closing > or />.
        private XmlToken TokenizeInsideElement()
        {
            char current = input[position];

            if (char.IsWhiteSpace(current))
            {
                return TokenizeElementWhitespace();
            }
            if (StartsWith("/>"))
            {
                return TokenizeSimple("/>", XmlTokenKind.SelfClose, XmlTokenizerMode.OutsideElement);
            }
            if (current == '>')
            {
                return TokenizeSimple(">", XmlTokenKind.Close, XmlTokenizerMode.OutsideElement);
            }
            if (current == '<')
            {
                // The element was never closed; treat this as the start of new markup.
                mode = XmlTokenizerMode.OutsideElement;
                return TokenizeOpen();
            }
            if (current == '=')
            {
                // '=' without an attribute name in front of it.
                return TokenizeSimple("=", XmlTokenKind.Equals, XmlTokenizerMode.AfterAttributeEquals);
            }
            if (current == '"' || current == '\'')
            {
                // Quoted value without a preceding name or '='.
                return TokenizeAttributeValue();
            }
            if (IsNameCharacter(current))
            {
                return TokenizeName(XmlTokenKind.AttributeName, XmlTokenizerMode.AfterAttributeName);
            }

            // Not valid here; consume one character so the tokenizer always makes progress.
            return TakeUntil(position + 1, XmlTokenKind.TextContent);
        }

        // Tokenizes plain text up to the next '<' or '&', or to the end of the input.
        private XmlToken TokenizeText()
        {
            Debug.Assert(input[position] != '<');
            Debug.Assert(input[position] != '&');
            Debug.Assert(mode == XmlTokenizerMode.OutsideElement);

            int end = input.IndexOfAny(TextTerminators, position);
            if (end < 0)
            {
                end = input.Length;
            }
            return TakeUntil(end, XmlTokenKind.TextContent);
        }

        private XmlToken TokenizeOutsideElement()
        {
            Debug.Assert(mode == XmlTokenizerMode.OutsideElement);

            if (position >= input.Length)
            {
                return new XmlToken(XmlTokenKind.EOF, 0);
            }

            switch (input[position])
            {
                case '<':
                    return TokenizeOpen();
                case '&':
                    return TokenizeEntity();
                default:
                    return TokenizeText();
            }
        }

        // Emits a fixed-text token, advances past it and switches to nextMode.
        // The caller is responsible for having checked that text is present.
        private XmlToken TokenizeSimple(string text, XmlTokenKind kind, XmlTokenizerMode nextMode)
        {
            XmlToken token = new XmlToken(kind, text.Length);
            position += text.Length;
            mode = nextMode;
            return token;
        }

        // Tokenizes the opening punctuation of a comment, CDATA section,
        // processing instruction, closing tag or opening tag. The longer
        // prefixes are tested first because they all begin with '<'.
        private XmlToken TokenizeOpen()
        {
            Debug.Assert(input[position] == '<');

            if (StartsWith("<!--"))
            {
                return TokenizeSimple("<!--", XmlTokenKind.CommentBegin, XmlTokenizerMode.InsideComment);
            }
            if (StartsWith("<![CDATA["))
            {
                return TokenizeSimple("<![CDATA[", XmlTokenKind.CDataBegin, XmlTokenizerMode.InsideCData);
            }
            if (StartsWith("<?"))
            {
                return TokenizeSimple("<?", XmlTokenKind.OpenProcessingInstruction, XmlTokenizerMode.InsideProcessingInstruction);
            }
            if (StartsWith("</"))
            {
                return TokenizeSimple("</", XmlTokenKind.OpenClose, XmlTokenizerMode.AfterOpen);
            }
            return TokenizeSimple("<", XmlTokenKind.Open, XmlTokenizerMode.AfterOpen);
        }

        // Tokenizes an entity reference: '&', then name characters or '#',
        // then the terminating ';' when present. A bare or unterminated '&'
        // still yields a token of at least one character.
        private XmlToken TokenizeEntity()
        {
            Debug.Assert(mode == XmlTokenizerMode.OutsideElement);
            Debug.Assert(input[position] == '&');

            int end = position + 1;
            while (end < input.Length && (IsNameCharacter(input[end]) || input[end] == '#'))
            {
                end++;
            }
            if (end < input.Length && input[end] == ';')
            {
                end++; // the ';' belongs to the entity
            }
            return TakeUntil(end, XmlTokenKind.Entity);
        }

        // Shared by comments, CDATA sections and processing instructions.
        // At the terminator: emit it and return to OutsideElement. Otherwise
        // emit the content up to the terminator, or up to the end of the input
        // when the terminator is absent; the mode is then left unchanged so
        // the next Tokenize call continues inside the same construct.
        private XmlToken TokenizeUntilTerminator(string terminator, XmlTokenKind terminatorKind, XmlTokenKind contentKind)
        {
            int terminatorIndex = input.IndexOf(terminator, position, StringComparison.Ordinal);
            if (terminatorIndex == position)
            {
                return TokenizeSimple(terminator, terminatorKind, XmlTokenizerMode.OutsideElement);
            }

            int end = terminatorIndex < 0 ? input.Length : terminatorIndex;
            return TakeUntil(end, contentKind);
        }

        private XmlToken TokenizeInsideProcessingInstruction()
        {
            Debug.Assert(mode == XmlTokenizerMode.InsideProcessingInstruction);
            return TokenizeUntilTerminator("?>", XmlTokenKind.CloseProcessingInstruction, XmlTokenKind.TextContent);
        }

        private XmlToken TokenizeInsideCData()
        {
            Debug.Assert(mode == XmlTokenizerMode.InsideCData);
            return TokenizeUntilTerminator("]]>", XmlTokenKind.CDataEnd, XmlTokenKind.TextContent);
        }

        private XmlToken TokenizeInsideComment()
        {
            Debug.Assert(mode == XmlTokenizerMode.InsideComment);
            return TokenizeUntilTerminator("-->", XmlTokenKind.CommentEnd, XmlTokenKind.CommentText);
        }
    }
}
If you're doing an interactive editor, you'll want to fix a few straightforward bugs in the tokenizer for handling invalid XML. Enjoy!