110,572
社区成员
发帖
与我相关
我的任务
分享
// Simplest approach: download the raw page source in one call.
// Type and method names are case-sensitive in C# (WebClient / DownloadString),
// and WebClient is IDisposable, so it is released as soon as the download finishes.
string 源文件;
using (var client = new WebClient())
{
    源文件 = client.DownloadString("http://www.baidu.com");
}
/// <summary>
/// Downloads the page at <paramref name="url"/> with a synchronous HTTP request,
/// writes the downloaded text to "<paramref name="index"/>.html" in the current
/// directory, and returns the text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="index">Sequence number of the page; used in log output and as the output file name.</param>
/// <param name="encoding">Encoding used to decode the response body; <see cref="Encoding.Default"/> when null.</param>
/// <returns>
/// The decoded response body, or an empty string when the download failed
/// (the failure message is written to the console and an empty file is still produced).
/// </returns>
public static string GetGeneralContent(string url, int index, Encoding encoding = null)
{
    string strResult = "";
    try
    {
        Console.WriteLine("{0}:开始加载第{1}个页面,页面地址:{2}.", DateTime.Now, index, url);

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        // Connection timeout in milliseconds.
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Response, stream and reader all hold the underlying connection; dispose them
        // deterministically so repeated calls do not exhaust the connection pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, encoding ?? Encoding.Default))
        {
            strResult = streamReader.ReadToEnd();
        }

        Console.WriteLine("{0}:内容加载完毕.", DateTime.Now);
    }
    catch (Exception e)
    {
        // Best effort: report the failure and fall through with an empty result.
        Console.WriteLine(e.Message);
    }

    Console.WriteLine("{0}:开始写入HTML文件", DateTime.Now);

    // Build the path with Path.Combine rather than embedding the directory in a format
    // string: a current directory containing '{' or '}' would make string.Format throw.
    string outputPath = Path.Combine(Environment.CurrentDirectory, index + ".html");
    using (StreamWriter sw = new StreamWriter(outputPath))
    {
        sw.WriteLine(strResult);
    }

    Console.WriteLine("{0}:html 文件 已经生成!", DateTime.Now);
    Console.WriteLine("{0}:数据开始入库", DateTime.Now);
    return strResult;
}
其二:如果网站的内容是异步加载的,上面的方法就不适用了,请使用下面的方法(需要 cookie 验证的网站同样适用)。
// Set to true by Target when the post-load wait has elapsed. The timer raises Elapsed
// on a different thread than the one polling this flag, hence volatile.
static volatile bool isComplete = false;
// Shared timer used by Load to give page scripts extra time after the document is complete.
static System.Timers.Timer timer = new System.Timers.Timer();
#region CoreMethod
/// <summary>
/// Loads <paramref name="url"/> in a WebBrowser control, waits for the document to
/// reach the Complete state plus a further 5 seconds for scripts to run, then writes
/// the resulting HTML to "<paramref name="index"/>.html" in the current directory
/// and returns it.
/// </summary>
/// <param name="url">Address of the page to load.</param>
/// <param name="index">Sequence number of the page; used in log output and as the output file name.</param>
/// <param name="setCookie">Optional callback invoked with <paramref name="url"/> before navigation starts.</param>
/// <returns>The outer HTML of the document element after the wait.</returns>
/// <exception cref="TimeoutException">The page did not finish loading within 3 minutes.</exception>
public static string Load(string url, int index, Action<string> setCookie = null)
{
    // Compare the whole elapsed TimeSpan against the limit; TimeSpan.Minutes is only the
    // 0-59 minutes component, so "Minutes > 3" would not fire until the 4-minute mark.
    TimeSpan loadTimeout = TimeSpan.FromMinutes(3);
    var stopwatch = new Stopwatch();
    stopwatch.Start();
    Console.WriteLine("{0}:开始加载第{1}个页面,页面地址:{2}.", DateTime.Now, index, url);

    string gethtml;
    // WebBrowser wraps a native control; dispose it so repeated calls do not leak handles.
    using (var browser = new WebBrowser())
    {
        if (setCookie != null)
        {
            setCookie.Invoke(url);
        }
        browser.ScriptErrorsSuppressed = true;
        browser.Navigate(url);

        // First wait for the document itself to finish loading.
        Console.WriteLine("{0}:开始加载动态内容.", DateTime.Now);
        while (browser.ReadyState != WebBrowserReadyState.Complete)
        {
            Application.DoEvents();
            if (stopwatch.Elapsed > loadTimeout)
            {
                throw new TimeoutException("请求站点超时!");
            }
            // Yield briefly instead of spinning a core at 100% while pumping messages.
            System.Threading.Thread.Sleep(10);
        }

        // Re-arm the shared wait state for this call: without resetting the flag every
        // call after the first would skip the wait, and without removing the handler
        // first each call would add one more subscription.
        isComplete = false;
        timer.Elapsed -= Target;
        timer.Elapsed += Target;
        timer.AutoReset = false;
        timer.Interval = 1000 * 5;
        timer.Start();

        // Keep pumping messages for another 5 s so page scripts can finish.
        while (!isComplete)
        {
            Application.DoEvents();
            System.Threading.Thread.Sleep(10);
        }

        HTMLDocument htmldocument = (HTMLDocument)browser.Document.DomDocument;
        gethtml = htmldocument.documentElement.outerHTML;
    }
    Console.WriteLine("{0}:动态内容加载完毕.", DateTime.Now);

    // Write the snapshot to disk. Path.Combine avoids treating the directory name as a
    // format string (braces in the path would make string.Format throw).
    string outputPath = Path.Combine(Environment.CurrentDirectory, index + ".html");
    using (StreamWriter sw = new StreamWriter(outputPath))
    {
        sw.WriteLine(gethtml);
    }
    return gethtml;
}

/// <summary>
/// Timer callback: marks the post-load wait as finished and stops the timer.
/// </summary>
private static void Target(object sender, ElapsedEventArgs elapsedEventArgs)
{
    isComplete = true;
    timer.Stop();
}
其三:获取网页异步请求的内容并不局限于第二种方法,目前使用 CasperJS 效率更高;具体方法这里就不提供了,网上搜索一下有很多 demo。
第二步就是用正则表达式匹配内容了,建议使用 Notepad++ 打开,方便查找匹配。
最后的导出步骤就不赘述了。