110,499
社区成员
发帖
与我相关
我的任务
分享
// Stray snippet: assigns the Encoding property of `wc` (declared outside this view;
// presumably a WebClient - TODO confirm). The string argument appears to be a placeholder
// for the encoding name extracted from the target site, not a real encoding name.
wc.Encoding = Encoding.GetEncoding("提取的网站编码");
/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes it into a string
/// (page-source retrieval, method four).
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the page. When null or empty, the encoding is
/// detected from the page's <c>&lt;meta ... charset=...&gt;</c> declaration, falling back
/// to UTF-8 when no usable declaration is found.
/// </param>
/// <returns>The decoded page source, or <c>null</c> if the download or decoding failed.</returns>
public static string GetHtml(string url, string charSet)
{
    try
    {
        byte[] pageBytes;

        // WebClient is IDisposable; release it as soon as the download completes.
        using (WebClient client = new WebClient())
        {
            pageBytes = client.DownloadData(url);
        }

        return DecodeHtml(pageBytes, charSet);
    }
    catch (Exception)
    {
        // Best-effort contract kept for existing callers: any failure yields null.
        return null;
    }
}

// Captures the charset token of a <meta> tag, with or without surrounding quotes.
// The capture stops at the first character that cannot be part of an encoding name,
// so later quoted attributes on the same line are not swallowed.
private static readonly Regex MetaCharsetRegex = new Regex(
    "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
    RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

// Decodes downloaded bytes using the explicit charset if given, otherwise the charset
// declared in the document's <meta> tag, otherwise UTF-8.
private static string DecodeHtml(byte[] pageBytes, string charSet)
{
    System.Text.Encoding encoding;

    if (!string.IsNullOrEmpty(charSet))
    {
        // Honor the caller's choice. If the name is unknown, fall back to the system
        // default encoding, which is what was used before for any explicit charSet.
        encoding = ResolveEncoding(charSet.Trim(), System.Text.Encoding.Default);
    }
    else
    {
        // The charset declaration itself is plain ASCII, so an ASCII pass is enough to
        // locate it no matter how the rest of the document is encoded.
        string sniffText = System.Text.Encoding.ASCII.GetString(pageBytes);
        Match declaration = MetaCharsetRegex.Match(sniffText);
        string declaredName = declaration.Success ? declaration.Groups[1].Value : string.Empty;
        encoding = ResolveEncoding(declaredName, System.Text.Encoding.UTF8);
    }

    return encoding.GetString(pageBytes);
}

// Looks up an encoding by name, returning the fallback when the name is empty or
// not supported on this platform.
private static System.Text.Encoding ResolveEncoding(string name, System.Text.Encoding fallback)
{
    if (string.IsNullOrEmpty(name))
    {
        return fallback;
    }

    try
    {
        return System.Text.Encoding.GetEncoding(name);
    }
    catch (ArgumentException)
    {
        return fallback;
    }
    catch (NotSupportedException)
    {
        return fallback;
    }
}