1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
| '' Remove HTML Development formatting
'' Replace line breaks with space
'' because browsers inserts space
result = source.Replace("\r", " ")
'' Replace line breaks with space
'' because browsers inserts space
result = result.Replace("\n", " ")
'' Remove step-formatting
result = result.Replace("\t", String.Empty)
'' Remove repeating speces becuase browsers ignore them
result = System.Text.RegularExpressions.Regex.Replace(result, "( )+", " ")
'' Remove the header (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' remove all scripts (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<script>).*(</script>)", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' remove all styles (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' insert tabs in spaces of <td> tags
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*td([^>])*>", "\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' insert line breaks in places of <BR> and <LI> tags
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*br( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*li( )*>", "\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' insert line paragraphs (double line breaks) in place
'' if <P>, <DIV> and <TR> tags
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*div([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*tr([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<( )*p([^>])*>", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove remaining tags like <a>, links, images,
'' comments etc - anything thats enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result, "<[^>]*>", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
''replace special characters:
result = System.Text.RegularExpressions.Regex.Replace(result, " ", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "•", " * ", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "‹", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "›", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "™", "(tm)", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "⁄", "/", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "<", "<", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, ">", ">", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "©", "(c)", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "®", "(r)", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove all others. More can be added, see
'' http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result, "&(.{2,6});", String.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' for testng
''System.Text.RegularExpressions.Regex.Replace(result, this.txtRegex.Text,string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' make line breaking consistent
result = result.Replace("\n", "\r")
'' Remove extra line breaks and tabs:
'' replace over 2 breaks with 2 and over 4 tabs with 4.
'' Prepare first to remove any whitespaces inbetween
'' the escaped characters and remove redundant tabs inbetween linebreaks
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", "\t\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", "\t\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", "\r\r", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Remove multible tabs followind a linebreak with just one tab
result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", "\r\t", System.Text.RegularExpressions.RegexOptions.IgnoreCase)
'' Initial replacement target string for linebreaks
Dim breaks As String = "\r\r\r"
'' Initial replacement target string for tabs
Dim tabs As String = "\t\t\t\t\t"
Dim index As Integer
For index = 0 To index < result.Length
result = result.Replace(breaks, "\r\r")
result = result.Replace(tabs, "\t\t\t\t")
breaks = breaks + "\r"
tabs = tabs + "\t"
Next |
Partager