If you’ve ever spent a lot of time around the game of Chess you’ve probably seen or interacted with the PGN file format in one way or another. PGN stands for Portable Game Notation and is a concise way to represent Chess games.

Using regular expressions I’ve written a very lightweight parser to handle the file format. I wouldn’t be surprised if the regular expressions don’t capture every legal .pgn file, but it shows how effective regular expressions can be for semi-structured data.

Not being a Chess player myself, I’m not really sure what to do with:

"Nxf7 Rxe1+"

but perhaps I’ll look at that in a future blog post.

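Edit: On second thought, there are all sorts of things wrong with this parser. For example, notice how move ‘3’ is listed twice: the ‘3…’ marker resumes Black’s half of move 3 after the comment, but the parser treats it as the start of a new move, so every later move in the output is numbered one too high.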

Alas, once the raw list of moves is parsed, another step is required to coalesce them. In addition, a commenter pointed out that the 1/2-1/2 is the result of the game (a draw), not a move. I’ll spend a little more time on this before I can call it an official ‘parser’, but it’s a nice start :D.

open System.Text.RegularExpressions

// Sample PGN game used as the parser's input: seven tag pairs, a separator
// line, then the movetext (including one brace comment and the game result).
let pgnGameText = "
[Event \"F/S Return Match\"]
[Site \"Belgrade, Serbia Yugoslavia|JUG\"]
[Date \"1992.11.04\"]
[Round \"29\"]
[White \"Fischer, Robert J.\"]
[Black \"Spassky, Boris V.\"]
[Result \"1/2-1/2\"]
 
1. e4 e5 2. Nf3 Nc6 3. Bb5 {This opening is called the Ruy Lopez.} 3... a6
4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3 O-O 9. h3 Nb8  10. d4 Nbd7
11. c4 c6 12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15. Nb1 h6 16. Bh4 c5 17. dxe5
Nxe4 18. Bxe7 Qxe7 19. exd6 Qf6 20. Nbd2 Nxd6 21. Nc4 Nxc4 22. Bxc4 Nb6
23. Ne5 Rae8 24. Bxf7+ Rxf7 25. Nxf7 Rxe1+ 26. Qxe1 Kxf7 27. Qe3 Qg5 28. Qxg5
hxg5 29. b3 Ke6 30. a3 Kd6 31. axb4 cxb4 32. Ra5 Nd5 33. f3 Bc8 34. Kf2 Bf5
35. Ra7 g6 36. Ra6+ Kc5 37. Ke1 Nf4 38. g3 Nxh3 39. Kd2 Kb5 40. Rd6 Kc5 41. Ra6
Nf2 42. g4 Bd3 43. Re6 1/2-1/2
"

// Strip PGN tag pairs and brace comments from `text`, then flatten what is
// left (the movetext) onto a single line.
//
// Returns the remaining text with surrounding whitespace trimmed, carriage
// returns dropped and every newline replaced by a single space.
let removeMarkup text = 
    // '.' does not match '\n', so this greedy pattern removes from the first
    // '[' to the last ']' on a line and never runs past the end of that line.
    let tagPairs = new Regex(@"\[.*\]")
    let noTagPairs = tagPairs.Replace(text, "")

    // [^}]* stops at the first closing brace, so two comments on one line are
    // removed separately instead of taking the moves between them along.
    // Unlike '.', it also matches newlines, so multi-line comments are removed.
    let comments = new Regex(@"\{[^}]*\}")
    let noComments = comments.Replace(noTagPairs, "")
    
    // Trim surrounding whitespace and convert to a single line
    noComments.Trim().Replace("\r", "").Replace("\n", " ")
        
// Break the movetext apart at every move-number marker: a run of digits
// followed by one or more dots (e.g. "12." or "3..."). The markers themselves
// are discarded; whatever precedes the first marker becomes the first element,
// which is an empty string when the text starts with a marker.
let getMoves text =
    Regex.Split(text, @"\d+\.+", RegexOptions.Multiline)

// Movetext of the sample game: tag pairs and comments removed, on one line.
let normalizedText = pgnGameText |> removeMarkup
    
// Print every parsed entry on its own line as "Move  N: <text>", numbering
// from 1.
//
// The split in getMoves yields an empty string ahead of the first move
// marker; blank entries are dropped so no empty "Move  0:" line is printed.
// Entries are numbered by position, not by their PGN move number, so a move
// that is resumed after a comment ("3. Bb5 {...} 3... a6") still takes up two
// entries and shifts the numbers that follow.
let printGameMoves() = 
    getMoves normalizedText
    |> Array.map (fun move -> move.Trim())
    |> Array.filter (fun move -> move <> "")
    |> Array.iteri (fun idx move -> printfn "Move %2d: %s" (idx + 1) move)
(*
OUTPUT:

> printGameMoves();;
Move  1: e4 e5
Move  2: Nf3 Nc6
Move  3: Bb5
Move  4: a6
Move  5: Ba4 Nf6
Move  6: O-O Be7
Move  7: Re1 b5
Move  8: Bb3 d6
Move  9: c3 O-O
Move 10: h3 Nb8
Move 11: d4 Nbd7
Move 12: c4 c6
Move 13: cxb5 axb5
Move 14: Nc3 Bb7
Move 15: Bg5 b4
Move 16: Nb1 h6
Move 17: Bh4 c5
Move 18: dxe5 Nxe4
Move 19: Bxe7 Qxe7
Move 20: exd6 Qf6
Move 21: Nbd2 Nxd6
Move 22: Nc4 Nxc4
Move 23: Bxc4 Nb6
Move 24: Ne5 Rae8
Move 25: Bxf7+ Rxf7
Move 26: Nxf7 Rxe1+
Move 27: Qxe1 Kxf7
Move 28: Qe3 Qg5
Move 29: Qxg5 hxg5
Move 30: b3 Ke6
Move 31: a3 Kd6
Move 32: axb4 cxb4
Move 33: Ra5 Nd5
Move 34: f3 Bc8
Move 35: Kf2 Bf5
Move 36: Ra7 g6
Move 37: Ra6+ Kc5
Move 38: Ke1 Nf4
Move 39: g3 Nxh3
Move 40: Kd2 Kb5
Move 41: Rd6 Kc5
Move 42: Ra6 Nf2
Move 43: g4 Bd3
Move 44: Re6 1/2-1/2
val it : unit = ()
*)