c#抓取网页内容出现乱码
我做了如下代码读取网页内容,但在分析不同编码的网页时,出现乱码
//取得HTML源码
private string getHtmlInfo(string urlSelet)
{
string strResult = "";
if(urlSelet.Equals(""))
return strResult = "";
//HTML源码
Console.WriteLine("**********************=" + urlSelet);
try
{
//声明一个HttpWebRequest请求
HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(urlSelet);
webRequest.Method = "GET";
webRequest.UserAgent = "Opera/9.25 (Windows NT 6.0; U; en)";
HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse();
//Encoding encoding = Encoding.GetEncoding("GB2312");
//取得要取得的网页的编码方式
Encoding encoding = GetEncoding(webResponse);
using (System.IO.Stream stream = webResponse.GetResponseStream())
{
using (System.IO.StreamReader reader = new StreamReader(stream,encoding))
{
strResult = reader.ReadToEnd();
}
}
}
catch (Exception exp)
{
MessageBox.Show("出错:" + exp.Message);
}
return strResult;
}
//取得要取得的网页的编码方式
public Encoding GetEncoding(HttpWebResponse response)
{
Encoding code = Encoding.Default;
string charset = null;
//如果发现content-type头
string ctypeLower = response.Headers["content-type"];
string ctypeOrder = response.Headers["Content-Type"];
string ctype="";
if (!ctypeLower.Equals(""))
ctype = ctypeLower;
if (!ctypeOrder.Equals(""))
ctype = ctypeOrder;
Console.WriteLine("ctype:" + ctype);
if (ctype != null)
{
int ind = ctype.IndexOf("charset=");
if (ind != -1)
{
charset = ctype.ToLower().Substring(ind + 8);
}
}
Console.WriteLine("charset编码格式:" + charset);
if (charset != "")
{
try
{
code = Encoding.GetEncoding(charset);
}
catch{}
}
return code;
}
发现取网页编码时有时能取到,有时去不到,所以还会有显示乱码问题
请求哪位大侠帮我一下