求C＃提取网页正文内容代码

renyu6277 2007-05-11 03:54:47

哪位大虾有C＃提取网页正文内容的代码，可不可以发上来我参考参考。谢谢啦！！

...全文

926 13 打赏收藏转发到动态举报

写回复

用AI写文章

13 条回复

切换为时间正序

请发表友善的回复…

发表回复

grady.lu 2007-05-11

打赏
举报

真不明白，你是要怎么提取？不同的网页提取的正则表达式完全不一样。如果是同一种类型的网页，就需要写一个表达式。

zhangliu_521 2007-05-11

打赏
举报

(?#Copyright 2005, by Laser Lu.)(?<Style_Block>(?<begin>\<(?<tag>style)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Script_Block>(?<begin>\<(?<tag>script)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Xml_Directive>\<!(?<name>[\w-:]+)(?:\s+(?<argument>[\w-:]+|\"[\s\S]*?\"|\'[\s\S]*?\'))*\s*\>)|(?<Xml_Comment>\<!--[\s\S]*?--\>)|(?<Beginning_Tag>\<(?<tag>[\w-:]+)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)|(?<Ending_Tag>\</(?<tag>[\w-:]+)\>)|(?<Xml_CDATA>\<!\[CDATA\[(?<data>[\s\S]*?)\]\]\>)|(?<Xml_Literal>(?:(?<blank>[ ]+)|[^ \<\>])+)

Fan52027 2007-05-11

打赏
举报

1.用正则。
2.string.Substring(), string.IndexOf() 等字符串方法。

swife 2007-05-11

打赏
举报

用正则表达式提取符合条件的内容

zhangliu_521 2007-05-11

打赏
举报

你总得给出,你源码的格式嘛...

几乎一样,还是很多,不怎么一样啊

renyu6277 2007-05-11

打赏
举报

补充一下，网页我已经下载了，现在只是要提取网页里面的正文内容。

zhangliu_521 2007-05-11

打赏
举报

多线程,还能处理,网络连接异常的...

purplesunshine 2007-05-11

打赏
举报

/// <summary>
/// Downloads the page at <paramref name="url"/> through <c>ReadHttp</c> and writes the
/// text to <paramref name="filename"/> using the GB2312 encoding. If the target file
/// already exists it is first copied to <c>filename + ".bak"</c> (overwriting any old backup).
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="filename">Path of the file that receives the downloaded text.</param>
/// <returns>0 when the page was downloaded and written; -1 otherwise.</returns>
/// <remarks>
/// Failures are not thrown to the caller. When an ASP.NET request context is available
/// the error text is written to the current response; otherwise only the return value
/// signals the failure.
/// </remarks>
public static int saveHtmlFile(string url,string filename)
{
    int status = -1;
    string respHTML = string.Empty;
    try
    {
        if (ReadHttp(url, ref respHTML) == "OK")
        {
            // Keep the previous version of the file as a backup before overwriting it.
            if (File.Exists(filename))
            {
                File.Copy(filename, filename + ".bak", true);
            }

            // The using block flushes and closes the writer exactly once, even on error.
            using (StreamWriter sw = new StreamWriter(filename, false, Encoding.GetEncoding("GB2312")))
            {
                sw.WriteLine(respHTML);
            }
            status = 0;
        }
        else
        {
            // HttpContext.Current is null outside of a web request; skip the message
            // in that case instead of failing with a NullReferenceException.
            System.Web.HttpContext context = System.Web.HttpContext.Current;
            if (context != null)
            {
                context.Response.Write("找不到该页或服务器错误");
            }
        }
    }
    catch (Exception err)
    {
        System.Web.HttpContext context = System.Web.HttpContext.Current;
        if (context != null)
        {
            context.Response.Write(err.Message);
        }
        status = -1;
    }
    return status;
}

/// <summary>
/// Issues an HTTP request for <paramref name="url"/> and reads the whole response body,
/// decoded as GB2312, into <paramref name="content"/>.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="content">
/// Receives the response text. Left unchanged when the request or the read fails.
/// </param>
/// <returns>
/// The response's <c>StatusCode</c> converted to a string (for example "OK") when the body
/// was read completely; "ERROR" when getting the response or reading it failed.
/// </returns>
/// <remarks>
/// Creating the request happens outside the try block, so an exception raised by
/// <c>WebRequest.Create</c> (or by the cast) propagates to the caller.
/// </remarks>
public static string ReadHttp(string url,ref string content)
{
    string status = "ERROR";
    HttpWebRequest Webreq = (HttpWebRequest) WebRequest.Create(url);
    try
    {
        using (HttpWebResponse Webresp = (HttpWebResponse) Webreq.GetResponse())
        using (StreamReader strm = new StreamReader(Webresp.GetResponseStream(), Encoding.GetEncoding("GB2312")))
        {
            string body = strm.ReadToEnd();

            // Publish the result only after the body has been read in full, so a
            // failure while reading can never be reported as a successful status.
            content = body;
            status = Webresp.StatusCode.ToString();
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any request/read failure is reported to the caller
        // through the "ERROR" return value rather than as an exception.
    }
    return status;
}

zbw9119 2007-05-11

打赏
举报

// Build the HTTP request for the target page.
HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
req.Method = "GET"; // use "POST" instead when submitting form data
req.ContentType = "application/x-www-form-urlencoded";

// Send the request; the response stream carries the page body.
HttpWebResponse res = (HttpWebResponse)req.GetResponse();
Stream ReceiveStream = res.GetResponseStream();

kkun_3yue3 2007-05-11

打赏
举报

/// <summary>
/// Online variant of GetData, meant to be called from worker threads.
/// Reads the word from <c>textBox1</c>, queries dict.cn for it, extracts the result
/// block from the returned HTML, reports progress through the <c>CurrentState</c>
/// delegate, and stores the result with <c>Dict.Update</c> or <c>Dict.Insert</c>
/// depending on the <c>autoUpdate</c> check box.
/// </summary>
/// <remarks>
/// States reported through the delegate: "Doing" before the request, "Failure" when the
/// page says the word was not found, "Updated" or "Finish" with the result text otherwise.
/// Nothing further is reported when the result block is not present in the page.
/// Exceptions from the web request or from <c>Dict</c> propagate to the caller.
/// </remarks>
public void GetDataOnline()
{
    // Delegate used to hand state changes over to the UI thread via Invoke.
    DCurrentState oDCurrentState = new DCurrentState(CurrentState);
    this.Invoke(oDCurrentState, "Doing", null);
    Thread.Sleep(100);

    // NOTE(review): textBox1 and autoUpdate are read directly here although the method
    // is described as running on a worker thread — confirm this is safe for the caller.
    string en = this.textBox1.Text.ToString();

    // Escape the word so characters such as '&', '#' or '+' cannot alter the query string.
    WebRequest oRequest = WebRequest.Create("http://dict.cn/search/?q=" + Uri.EscapeDataString(en));
    // oRequest.Timeout = 800; // optional request timeout

    // Dispose response, stream and reader deterministically once the page is read.
    string oGetData;
    using (WebResponse oResponse = oRequest.GetResponse())
    using (Stream oStream = oResponse.GetResponseStream())
    using (StreamReader oReader = new StreamReader(oStream, Encoding.Default))
    {
        oGetData = oReader.ReadToEnd();
    }

    string oRegexPat = @"<big><font\040size=\""2\""\040face=\""Trebuchet\040MS\"">([\w\W]*?)</big>";
    Regex oRegex = new Regex(oRegexPat, RegexOptions.IgnoreCase);
    Match oMatch = oRegex.Match(oGetData);
    if (!oMatch.Success)
    {
        // Result block not found in the page: nothing to report or store.
        return;
    }

    // Capture group 1 of the first match is the text between the opening font tag and </big>.
    string translation = oMatch.Groups[1].Value;

    if (translation.IndexOf("对不起") >= 0 || translation.IndexOf("单词没找到") >= 0)
    {
        // The site answered with its "word not found" message.
        this.Invoke(oDCurrentState, "Failure", " ");
        return;
    }

    // The UI gets line breaks instead of <br>; the stored value keeps the raw HTML.
    bool updateExisting = this.autoUpdate.Checked;
    this.Invoke(oDCurrentState, updateExisting ? "Updated" : "Finish", translation.Replace("<br>", "\r\n"));

    Dict d = new Dict(en);
    d._En = en;
    d._Cn = translation;
    if (updateExisting)
    {
        // Auto-update is on: refresh the local entry.
        d.Update(d);
    }
    else
    {
        // Otherwise save the result as a new entry; failures propagate with their
        // original stack trace.
        d.Insert(d);
    }
}