using System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Web;
using System.Net;
引用别人的提取文本的程序,你试试:
s is the source code of the webpage.
private String fetchText(String s)
{
//Filter out HTML and JavaScript from the page, leaving only body text
s = Convert.ToString(Regex.Match(s, @"<body.+?</body>", RegexOptions.Singleline | RegexOptions.IgnoreCase)); //strip everything but <BODY>
s = Regex.Replace(s, "<script[^>]*?>.*?</script>", "", RegexOptions.Singleline | RegexOptions.IgnoreCase); //strip JavaScript
s = Regex.Replace(s, "<[^>]*>", ""); //strip HTML tags
s = Regex.Replace(s, "&(copy|#169);|&(quot|#34);|&(amp|#38);|&(lt|#60);&(gt|#62);|&(nbsp|#160);|&(iexcl|#161);|&(cent|#162);|&(pound|#163);|·", " "); //strip symbols
s = s.Replace("\t", " "); //strip tabs
s = Regex.Replace(s, "([\r\n])+", " "); //strip carriage returns
s = Regex.Replace(s, "\\s\\s+", " "); //strip white space (must be last)
return s.Trim();
}