求大神帮助!用 C# 编写瀑布流式网页的爬虫。
针对瀑布流式(滚动到底部才继续加载内容)的网页,爬虫该怎么模拟滚动条下滑,从而触发后续内容的加载?我是菜鸟一个,刚开始学习网络爬虫,希望各位大神帮帮忙!
下面是代码,请不吝指教!
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net.Sockets;
using System.Windows.Forms;
using mshtml;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
using System.Drawing;
namespace ConsoleApplication1
{
class Program
{
// Browser control used to load the page; created in Main.
static WebBrowser wb;
// Document of the most recent completed load; assigned in wb_DocumentCompleted.
static HtmlDocument htmldoc;
static int eventcounter = 0;// number of DocumentCompleted events since the last reset
static int navinum = 0;// number of Navigating events seen
static int windownum = 0;// number of NewWindow events seen (each one is cancelled)
static int height;// height of the document body's scroll rectangle
static int max = 2;// number of completed loads to wait for before extracting
//static ulong donum = 0;
//static bool istimeout = false;
//static List<string> todo = new List<string>(); // links still to visit
//static List<string> visited = new List<string>(); // links already visited
// URL the browser is pointed at on start-up.
static string startPointAdress = "http://www.pinterest.com/";
// Accumulated results of GetLinks, Getjpgs and Getitems respectively.
static List<string> urllist = new List<string>();
static List<string> jpglist = new List<string>();
static List<string> itemlist = new List<string>();
// Inner HTML of the document body captured at extraction time.
static string s;
/// <summary>
/// Entry point: creates the browser control, subscribes to its events, starts the
/// first navigation and then runs a message loop so those events can be raised.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
[STAThread]
static void Main(string[] args)
{
    wb = new WebBrowser();
    wb.ScriptErrorsSuppressed = true;

    // Subscribe before navigating so that no event raised by the first load is missed.
    wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
    wb.Navigating += new WebBrowserNavigatingEventHandler(wb_Navigating);
    wb.NewWindow += new System.ComponentModel.CancelEventHandler(wb_NewWindow);

    wb.Navigate(startPointAdress);

    // Blocking message loop. The previous `while (true) Application.DoEvents();`
    // pumped the same messages but spun continuously, keeping a CPU core busy.
    Application.Run();
}
/// <summary>
/// Cancels every request by the page to open a new browser window and logs
/// the running total of such requests.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data; its Cancel flag is set to true.</param>
static void wb_NewWindow(object sender, System.ComponentModel.CancelEventArgs e)
{
    e.Cancel = true;
    int total = ++windownum;
    Console.WriteLine("windownum is {0}", total);
}
/// <summary>
/// Counts each navigation the browser starts and logs the running total.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
static void wb_Navigating(object sender, WebBrowserNavigatingEventArgs e)
{
    int total = ++navinum;
    Console.WriteLine("navinum is {0}", total);
}
/// <summary>
/// Handles a completed document load. Scrolls the window to the bottom of the body
/// and, once <c>max</c> loads have completed, writes the body HTML and the results of
/// GetLinks, Getjpgs and Getitems to text files under <c>d:\</c>.
/// </summary>
/// <param name="sender">Event source (unused; the shared <c>wb</c> field is read instead).</param>
/// <param name="e">Event data (unused).</param>
static void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    eventcounter++;
    htmldoc = wb.Document;

    // Document, Body and Window may each be null; dereferencing them blindly
    // raised NullReferenceException inside the event handler.
    HtmlElement body = htmldoc != null ? htmldoc.Body : null;
    if (body != null)
    {
        height = body.ScrollRectangle.Height;
        HtmlWindow window = htmldoc.Window;
        if (window != null)
        {
            // Scroll to the bottom of the content that has been loaded so far.
            window.ScrollTo(0, height);
        }
    }

    Console.WriteLine("completenum is {0}", eventcounter);

    // Wait for more loads, or for a load that actually has a body to extract from.
    if (eventcounter < max || body == null)
    {
        return;
    }

    eventcounter = 0;
    s = body.InnerHtml ?? string.Empty;
    stringtotxt(s, @"d:\html.txt");
    int len = s.Length;
    Console.WriteLine("length is {0}", len);

    urllist.AddRange(GetLinks(s));
    listtotxt(urllist, @"d:\url.txt");
    jpglist.AddRange(Getjpgs(s));
    listtotxt(jpglist, @"d:\jpg.txt");
    itemlist.AddRange(Getitems(s));
    listtotxt(itemlist, @"d:\item.txt");

    // Pause until the user presses Enter; no further browser events are
    // processed while this call is blocking.
    Console.ReadLine();
}
/// <summary>
/// Finds every <c>href="/pin/&lt;digits&gt;/"</c> attribute in the given HTML
/// (case-insensitive; the digit run may be empty).
/// </summary>
/// <param name="htmlPage">HTML text to scan.</param>
/// <returns>The matched attribute texts in the order they occur; empty when there are none.</returns>
static List<string> Getjpgs(string htmlPage)
{
    Regex pinHref = new Regex("href=\"/pin/[0-9]*/\"", RegexOptions.IgnoreCase);
    return pinHref.Matches(htmlPage)
                  .Cast<Match>()
                  .Select(hit => hit.Value)
                  .ToList();
}
/// <summary>
/// Extracts absolute URLs from the given HTML. Both <c>http://</c> and
/// <c>https://</c> links are returned; previously only <c>http://</c> matched,
/// so every HTTPS link was silently dropped.
/// </summary>
/// <param name="htmlPage">HTML text to scan.</param>
/// <returns>The matched URLs in the order they occur (duplicates kept); empty when there are none.</returns>
static List<string> GetLinks(string htmlPage)
{
    // Scheme, then a run of host characters (word characters, '+', '?', '.'),
    // then an optional run of path/query characters.
    Regex regx = new Regex("https?://([\\w+?\\.\\w+])+([a-zA-Z0-9\\~\\!\\@\\#\\$\\%\\^\\&\\*\\(\\)_\\-\\=\\+\\\\\\/\\?\\.\\:\\;\\'\\,]*)?", RegexOptions.IgnoreCase);
    MatchCollection matches = regx.Matches(htmlPage);
    List<string> results = new List<string>();
    foreach (Match match in matches)
    {
        results.Add(match.Value);
    }
    return results;
}
/// <summary>
/// Finds every occurrence of the opening tag text <c>&lt;div class="item "</c>
/// (case-insensitive) in the given HTML.
/// </summary>
/// <param name="htmlPage">HTML text to scan.</param>
/// <returns>One entry per occurrence, holding the matched text; empty when there are none.</returns>
static List<string> Getitems(string htmlPage)
{
    MatchCollection hits = Regex.Matches(htmlPage, "<div class=\"item \"", RegexOptions.IgnoreCase);
    List<string> items = new List<string>();
    for (int i = 0; i < hits.Count; i++)
    {
        items.Add(hits[i].Value);
    }
    return items;
}
/// <summary>
/// Writes every string of <paramref name="list"/> on its own line to
/// <paramref name="txtFile"/>, replacing whatever the file contained before.
/// </summary>
/// <param name="list">Lines to write.</param>
/// <param name="txtFile">Path of the text file to create or overwrite.</param>
static void listtotxt(List<string> list, string txtFile)
{
    // FileMode.Create truncates an existing file. The former OpenOrCreate + Seek(0)
    // overwrote from the start but left the tail of a longer old file in place.
    // The using blocks close both streams even if a write throws.
    using (FileStream fs = new FileStream(txtFile, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        foreach (string line in list)
        {
            sw.WriteLine(line);
        }
    }
}
/// <summary>
/// Writes <paramref name="s"/> to <paramref name="txtFile"/>, replacing whatever
/// the file contained before.
/// </summary>
/// <param name="s">Text to write.</param>
/// <param name="txtFile">Path of the text file to create or overwrite.</param>
static void stringtotxt(string s, string txtFile)
{
    // FileMode.Create truncates an existing file. The former OpenOrCreate + Seek(0)
    // overwrote from the start but left the tail of a longer old file in place.
    // The using blocks close both streams even if the write throws.
    using (FileStream fs = new FileStream(txtFile, FileMode.Create, FileAccess.Write))
    using (StreamWriter sw = new StreamWriter(fs))
    {
        sw.Write(s);
    }
}
}
}