''' <summary>
''' Converts HTML markup into approximate plain text using a fixed sequence of
''' regular-expression substitutions. This is a heuristic clean-up, not an HTML parser.
''' </summary>
''' <param name="Source">HTML markup to convert. Nothing or an empty string yields an empty string.</param>
''' <returns>
''' The text content with head/script/style blocks removed, table cells separated by tabs,
''' br/li turned into single line breaks, p/div/tr turned into double line breaks,
''' the five common entities decoded, all other entities removed, and whitespace tidied.
''' </returns>
Private Function ConvertHTMLToText(ByVal Source As String) As String
    ' Regex.Replace throws on a Nothing input, so treat "no markup" as "no text".
    If String.IsNullOrEmpty(Source) Then
        Return String.Empty
    End If

    ' Singleline lets "." span any line feed that might remain inside a block.
    Const TagOptions As RegexOptions = RegexOptions.IgnoreCase Or RegexOptions.Singleline
    Dim paragraphBreak As String = vbCrLf & vbCrLf
    Dim result As String = Source

    ' Source line breaks are insignificant whitespace in HTML. Replace them with a
    ' space (not nothing) so that words on adjacent lines are not glued together
    ' and so that "<div" & newline & "class=..." still reads as a div tag.
    result = Regex.Replace(result, "[\r\n\f]+", " ")

    ' Delete <head>, <script> and <style> blocks including their content.
    ' The match is lazy (.*?) so that each block is removed on its own; a greedy
    ' match would also swallow all visible text between the first and last block.
    ' \b stops "head" from matching <header>.
    For Each tagName As String In New String() {"head", "script", "style"}
        Dim blockPattern As String = "< *" & tagName & "\b[^>]*>.*?< */ *" & tagName & " *>"
        result = Regex.Replace(result, blockPattern, String.Empty, TagOptions)
    Next

    ' Structural tags become whitespace. \b keeps <p> from matching <pre>/<param>,
    ' <li> from matching <link>, <tr> from matching <track>, and so on.
    result = Regex.Replace(result, "< *td\b[^>]*>", vbTab, TagOptions)
    result = Regex.Replace(result, "< *br\b[^>]*>", vbCrLf, TagOptions)
    result = Regex.Replace(result, "< *li\b[^>]*>", vbCrLf, TagOptions)
    result = Regex.Replace(result, "< *div\b[^>]*>", paragraphBreak, TagOptions)
    result = Regex.Replace(result, "< *tr\b[^>]*>", paragraphBreak, TagOptions)
    result = Regex.Replace(result, "< *p\b[^>]*>", paragraphBreak, TagOptions)

    ' Remove all remaining html tags.
    result = Regex.Replace(result, "<[^>]*>", String.Empty, TagOptions)

    ' Entities are handled only AFTER the tags are gone; decoding "&lt;b&gt;" first
    ' would turn escaped text into a real tag that then gets stripped.
    result = Regex.Replace(result, "&lt;", "<", RegexOptions.IgnoreCase)
    result = Regex.Replace(result, "&gt;", ">", RegexOptions.IgnoreCase)
    result = Regex.Replace(result, "&nbsp;", " ", RegexOptions.IgnoreCase)
    result = Regex.Replace(result, "&quot;", """", RegexOptions.IgnoreCase)

    ' Remove numeric (&#nn; / &#xhh;) and all other short named entities.
    ' Only entity-shaped sequences are matched, so ordinary text such as "& d;"
    ' is left alone, and &amp; is excluded because it is decoded below.
    ' More entities could be decoded instead of removed - see:
    ' http://www.degraeve.com/reference/specialcharacters.php
    ' http://www.web-source.net/symbols.htm
    result = Regex.Replace(result, "&(?:#x?[0-9a-f]{1,6}|(?!amp;)[a-z][a-z0-9]{1,5});", String.Empty, RegexOptions.IgnoreCase)

    ' Decode &amp; last so that "&amp;lt;" ends up as the literal text "&lt;".
    result = Regex.Replace(result, "&amp;", "&", RegexOptions.IgnoreCase)

    ' Replace repeating spaces with a single space.
    result = Regex.Replace(result, " +", " ")
    ' Remove trailing spaces and tabs from the end of each line.
    result = Regex.Replace(result, "[ \t]+\r\n", vbCrLf)
    ' Remove spaces at the start of each line (tabs are kept: they mark table cells).
    result = Regex.Replace(result, "\r\n +", vbCrLf)
    ' Remove leading and trailing whitespace of the whole text.
    result = Regex.Replace(result, "^\s+", String.Empty)
    result = Regex.Replace(result, "\s+$", String.Empty)
    ' Collapse three or more consecutive line breaks into two.
    result = Regex.Replace(result, "(\r\n){3,}", paragraphBreak)

    Return result
End Function