110,561
社区成员
发帖
与我相关
我的任务
分享
/// <summary>
/// A <see cref="WebClient"/> that asks HTTP requests to transparently
/// decompress gzip- and deflate-encoded response bodies.
/// </summary>
public class GZipWebClient : WebClient
{
    /// <summary>
    /// Creates the request for <paramref name="address"/> and, when it is an
    /// HTTP(S) request, enables automatic gzip/deflate decompression on it.
    /// </summary>
    /// <param name="address">The resource to request.</param>
    /// <returns>
    /// The request produced by the base implementation; returned unchanged
    /// when it is not an <see cref="HttpWebRequest"/>.
    /// </returns>
    protected override WebRequest GetWebRequest(Uri address)
    {
        WebRequest request = base.GetWebRequest(address);

        // Only HTTP requests expose AutomaticDecompression. A hard cast here
        // would throw InvalidCastException for file:// or ftp:// addresses,
        // so non-HTTP requests are passed through untouched.
        HttpWebRequest httpRequest = request as HttpWebRequest;
        if (httpRequest != null)
        {
            httpRequest.AutomaticDecompression =
                DecompressionMethods.GZip | DecompressionMethods.Deflate;
        }

        return request;
    }
}
或者根据响应头来使用不同的方法读取内容
参考: System.Net.WebClient.DownloadString chunked gzip
/// <summary>
/// Synchronously downloads <paramref name="address"/> and writes the response
/// body to <paramref name="outputStream"/>, decompressing it first when the
/// server reports <c>Content-Encoding: gzip</c>.
/// </summary>
/// <param name="address">The resource to download.</param>
/// <param name="outputStream">Destination for the (decompressed) body. It is written to but not closed.</param>
/// <param name="bufferSize">Size in bytes of the copy buffer; must be positive.</param>
/// <exception cref="ArgumentNullException"><paramref name="address"/> or <paramref name="outputStream"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is zero or negative.</exception>
private static void DownloadSync(Uri address, Stream outputStream, int bufferSize = 4096)
{
    // Validate up front so bad arguments fail before any network traffic.
    if (address == null)
    {
        throw new ArgumentNullException("address");
    }
    if (outputStream == null)
    {
        throw new ArgumentNullException("outputStream");
    }
    if (bufferSize <= 0)
    {
        // A zero-length buffer would make Read return 0 immediately and
        // silently produce an empty download.
        throw new ArgumentOutOfRangeException("bufferSize", "Buffer size must be positive.");
    }

    using (var webClient = new WebClient())
    using (var responseStream = webClient.OpenRead(address))
    {
        var responseHeaders = webClient.ResponseHeaders;
        string contentEncoding = responseHeaders == null
            ? null
            : responseHeaders[HttpResponseHeader.ContentEncoding];

        // Content-Encoding alone decides whether the body is compressed;
        // Transfer-Encoding (chunked or not) only describes the framing, so a
        // gzip body delivered with Content-Length must be decompressed too.
        // Header values are compared case-insensitively.
        bool isGzip = contentEncoding != null
            && string.Equals(contentEncoding.Trim(), "gzip", StringComparison.OrdinalIgnoreCase);

        Stream contentStream = isGzip
            ? new GZipStream(responseStream, CompressionMode.Decompress, false)
            : responseStream;

        using (contentStream)
        {
            contentStream.CopyTo(outputStream, bufferSize);
        }
    }
}
建议使用两个开源的类库
苏飞的C#HttpHelper是挺好的抓取类库
Html Agility Pack是很好用的html解析类库
我是用 WebClient MyWebClient1 = new WebClient(); 通过正则表达式采集的，没有处理 gzip 压缩等情况。