62,046
社区成员
发帖
与我相关
我的任务
分享
/// <summary>
/// Downloads the resource at <paramref name="url"/> and, when its content type is
/// <c>text/*</c>, writes the decoded body to the console.
/// </summary>
/// <param name="url">Absolute HTTP(S) URL of the resource to fetch.</param>
/// <remarks>
/// This is a best-effort console helper: any failure (bad URL, network error, unknown
/// charset name, ...) is reported on the console as <c>Error:&lt;message&gt;</c> and is
/// not rethrown. Line terminators of the body are dropped, so the page is printed as a
/// single line.
/// </remarks>
public static void GetWebContent(string url)
{
    try
    {
        HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);

        // Dispose the response on every path (non-text content, exceptions) so the
        // underlying connection is always released.
        using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
        {
            string encoding = myHttpWebResponse.CharacterSet;
            Console.WriteLine("字符编码:" + encoding);

            // Original author's observation: pages that are really gb2312 are reported by
            // .NET as ISO-8859-1, so that value is mapped to gb2312. A missing/empty charset
            // is treated the same way, because Encoding.GetEncoding would throw on it.
            if (String.IsNullOrEmpty(encoding)
                || String.Equals(encoding.Trim(), "ISO-8859-1", StringComparison.OrdinalIgnoreCase))
            {
                encoding = "gb2312";
            }

            // Main MIME type is the part before '/', e.g. "text" for "text/html".
            // A content type without '/' (or no content type at all) is compared as-is
            // instead of crashing in Substring.
            string contentType = myHttpWebResponse.ContentType ?? String.Empty;
            int slashIndex = contentType.IndexOf('/');
            string mainType = slashIndex >= 0 ? contentType.Substring(0, slashIndex) : contentType;

            // Only textual resources are printed; images and other binary files are skipped.
            if (String.Equals(mainType.Trim(), "text", StringComparison.OrdinalIgnoreCase))
            {
                Console.WriteLine("\n Content type is 'text'.");

                using (Stream webStream = myHttpWebResponse.GetResponseStream())
                using (StreamReader reader = new StreamReader(webStream, System.Text.Encoding.GetEncoding(encoding)))
                {
                    Console.WriteLine("网页内容:");

                    StringBuilder sb = new StringBuilder();
                    string line;

                    // Loop on ReadLine() == null rather than Peek(): Peek can report -1 on a
                    // network stream before the body is complete, truncating the page.
                    while ((line = reader.ReadLine()) != null)
                    {
                        sb.Append(line);
                    }

                    Console.WriteLine(sb.ToString());
                }
            }
            else
            {
                Console.Write("网页内容不是文本无法显示!");
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberately broad: this helper only reports problems, it never propagates them.
        Console.WriteLine("Error:" + ex.Message);
    }
}
/// <summary>
/// Determines the character-encoding name of the web page at <paramref name="url"/>.
/// </summary>
/// <param name="url">Absolute HTTP(S) URL of the page to inspect.</param>
/// <returns>
/// The first <c>charset=...</c> value found in the page markup; otherwise the charset
/// reported by the HTTP response; otherwise <c>Encoding.Default.BodyName</c>. The default
/// is also returned when the status is not 200 OK, when the declared content length is
/// 1 MiB or more, or when any error occurs (this method never throws).
/// </returns>
public static string GetEncoding(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 20000; // milliseconds
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            // Skip non-OK answers (redirects are not followed) and bodies declared as 1 MiB or larger.
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
            {
                return Encoding.Default.BodyName;
            }

            bool isGzip = String.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase);

            string html;
            using (Stream body = response.GetResponseStream())
            using (Stream content = isGzip ? new GZipStream(body, CompressionMode.Decompress) : body)
            using (StreamReader reader = new StreamReader(content, Encoding.ASCII))
            {
                // ASCII is sufficient here: only the charset declaration, which is plain
                // ASCII, is extracted from the markup.
                html = reader.ReadToEnd();
            }

            // Accepts charset=utf-8, charset="utf-8" and charset='utf-8' in any letter case.
            // The value is limited to charset-name characters so the capture never comes back
            // empty and never runs on to a later quote.
            Match declared = Regex.Match(
                html,
                @"charset\b\s*=\s*[""']?\s*(?<charset>[A-Za-z0-9][\w.:\-]*)",
                RegexOptions.IgnoreCase);
            if (declared.Success)
            {
                return declared.Groups["charset"].Value;
            }

            // Fall back to the charset from the response; guard against null as well as empty.
            if (!String.IsNullOrEmpty(response.CharacterSet))
            {
                return response.CharacterSet;
            }
        }
    }
    catch (Exception)
    {
        // Deliberate best-effort: any failure (invalid URL, network error, corrupt gzip
        // data, ...) falls through to the platform default below.
    }

    return Encoding.Default.BodyName;
}