WebBrowser模拟点击爬取分页数据,怎么控制点击次数?
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Web;
using System.Text;
using HtmlAgilityPack;
using Conn;
namespace DataApp
{
/// <summary>
/// Main window of the crawler. It loads a paginated enterprise list in the embedded
/// <c>webBrowser1</c> control, scrapes every row of the current page, and then simulates a click
/// on the "next page" button until the last page has been processed.
/// </summary>
public partial class MainForm : Form
{
    // Captures the href value of the first quoted <a ... href="..."> tag in a fragment.
    private static readonly Regex AnchorHrefRegex = new Regex(
        @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>",
        RegexOptions.IgnoreCase);

    // Base address every relative page name is appended to.
    string url = @"http://117.159.3.6:9035/QueryWeb/";

    // Number of navigations that have started (Navigated) but not yet completed (DocumentCompleted).
    int count = 0;

    DataTable dt1 = new DataTable("t1"); // enterprise basic information
    DataTable dt2 = new DataTable("t2"); // enterprise qualification certificates

    /// <summary>
    /// Builds the designer-generated controls and prepares the result tables.
    /// </summary>
    public MainForm()
    {
        InitializeComponent();
        EnsureSchema();
    }

    /// <summary>
    /// Form load handler. Makes sure both result tables have their columns; the call is idempotent.
    /// </summary>
    private void MainForm_Load(object sender, EventArgs e)
    {
        EnsureSchema();
    }

    /// <summary>
    /// Creates the columns of <c>dt1</c> and <c>dt2</c> that the scraping methods write to.
    /// Without these columns every <c>dr["..."]</c> assignment throws an <see cref="ArgumentException"/>.
    /// </summary>
    private void EnsureSchema()
    {
        // Enterprise basic information.
        AddMissingColumns(dt1,
            "企业名称", "统一信用代码", "注册地址", "企业类型", "注册日期",
            "营业地址", "营业地址邮编", "法定代表人", "官网");

        // Qualification certificate information.
        AddMissingColumns(dt2,
            "企业名称", "资质类型", "资质证书编号", "发证机关", "发证日期", "有效期至", "资质范围");
    }

    /// <summary>
    /// Adds a string column for every name that <paramref name="table"/> does not contain yet.
    /// </summary>
    /// <param name="table">Table to extend.</param>
    /// <param name="columnNames">Column names that must exist afterwards.</param>
    private static void AddMissingColumns(DataTable table, params string[] columnNames)
    {
        foreach (string columnName in columnNames)
        {
            if (!table.Columns.Contains(columnName))
            {
                table.Columns.Add(columnName, typeof(string));
            }
        }
    }

    /// <summary>
    /// Starts a crawl of the construction-enterprise list.
    /// </summary>
    private void BtnCai_Click(object sender, EventArgs e)
    {
        string jzurl = "query11.aspx?type=&typeNum=7&Province=1"; // construction enterprises
        // Alternative targets:
        //   "query11.aspx?type=工程设计&typeNum=2&Province=1"  (design enterprises)
        //   "query41.aspx?Province=2"                          (enterprises from other provinces)

        // Detach first so that pressing the button again never leaves the handlers subscribed
        // twice; a double subscription would run the scrape twice per page and break the counter.
        webBrowser1.Navigated -= Web_Navigated;
        webBrowser1.DocumentCompleted -= Web_DocumentCompleted;
        webBrowser1.Navigated += Web_Navigated;
        webBrowser1.DocumentCompleted += Web_DocumentCompleted;

        // Start every crawl with a balanced counter, and subscribe before navigating so that
        // no event of the new navigation can be missed.
        count = 0;
        webBrowser1.Navigate(url + jzurl);
    }

    /// <summary>
    /// Runs when a document has finished loading. Once every started navigation has completed,
    /// the current page is scraped, the paging labels are refreshed, and the "next page" button
    /// is clicked only while the current page index is lower than the total page count.
    /// </summary>
    private void Web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        count = count - 1;
        if (count != 0)
        {
            return;
        }

        GetInfoByDOM(webBrowser1);

        System.Windows.Forms.HtmlDocument htdoc = webBrowser1.Document;
        if (htdoc == null)
        {
            return;
        }

        HtmlElement htmlcounts = htdoc.GetElementById("ctl00_ContentPlaceHolder1_GridView1_ctl13_Label4");
        HtmlElement htmlpages = htdoc.GetElementById("lblPageCount");
        HtmlElement htmlpagesindex = htdoc.GetElementById("ctl00_ContentPlaceHolder1_GridView1_ctl13_lblPageIndex");
        HtmlElement btnclicktag = htdoc.GetElementById("ctl00_ContentPlaceHolder1_GridView1_ctl13_btnNext");

        string totalText = GetElementText(htmlcounts);
        string pageCountText = GetElementText(htmlpages);
        string pageIndexText = GetElementText(htmlpagesindex);

        lblCounts.Text = totalText;        // total number of records
        LblPages.Text = pageCountText;     // total number of pages
        lblPageIndex.Text = pageIndexText; // current page

        // Stop paging on the last page. When the paging information cannot be read the crawl
        // also stops, because clicking blindly would never terminate.
        int pageIndex;
        int pageCount;
        bool hasMorePages = TryParseFirstInt(pageIndexText, out pageIndex)
            && TryParseFirstInt(pageCountText, out pageCount)
            && pageIndex < pageCount;

        if (hasMorePages && btnclicktag != null)
        {
            btnclicktag.InvokeMember("click"); // trigger the "next page" button
        }
    }

    /// <summary>
    /// Counts a started navigation; <see cref="Web_DocumentCompleted"/> counts it back down.
    /// </summary>
    private void Web_Navigated(object sender, WebBrowserNavigatedEventArgs e)
    {
        count++;
    }

    /// <summary>
    /// Downloads the detail page of one enterprise and appends its basic information to <c>dt1</c>.
    /// Fields whose element is missing from the page are stored as empty strings.
    /// </summary>
    /// <param name="par">Query string (without the leading '?') identifying the enterprise.</param>
    public void GetBasicInfo(string par)
    {
        var html = url + "CorpDetails.aspx?" + par;
        var web = new HtmlWeb();
        HtmlAgilityPack.HtmlDocument htmlDoc = web.Load(html);

        DataRow dr = dt1.NewRow();
        dr["企业名称"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label10");
        dr["统一信用代码"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label3");
        dr["注册地址"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label1");
        dr["企业类型"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label2");
        dr["注册日期"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label4");
        dr["营业地址"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label6");
        dr["营业地址邮编"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label7");
        dr["法定代表人"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label8");
        dr["官网"] = GetInnerTextById(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label13");
        dt1.Rows.Add(dr);

        GDV.DataSource = dt1;
    }

    /// <summary>
    /// Database connectivity test: shows the result of a fixed query in the grid.
    /// </summary>
    private void BtnData_Click(object sender, EventArgs e)
    {
        string sql = "SELECT * FROM ims_hulu_info_shop";
        DataSet ds = DbHelperMySQL.Query(sql);

        // Nothing to show when the helper returns no result set.
        if (ds == null || ds.Tables.Count == 0)
        {
            return;
        }

        GDV.DataSource = ds.Tables[0];
    }

    /// <summary>
    /// Scrapes the list table of the page currently shown in <paramref name="WebBro"/> and
    /// collects the qualification certificates of every enterprise linked from it.
    /// </summary>
    /// <param name="WebBro">Browser control whose loaded document is parsed.</param>
    private void GetInfoByDOM(WebBrowser WebBro)
    {
        string pageHtml = WebBro.DocumentText;
        if (string.IsNullOrEmpty(pageHtml))
        {
            return;
        }

        var Doc = new HtmlAgilityPack.HtmlDocument();
        Doc.LoadHtml(pageHtml);

        var res = Doc.GetElementbyId("ctl00_ContentPlaceHolder1_GridView1"); // list table
        if (res == null)
        {
            return;
        }

        var trs = res.SelectNodes(@"tr"); // null when the table has no direct rows
        if (trs == null)
        {
            return;
        }

        // Row 0 is the header. The last row is skipped as well, exactly as before
        // (presumably it is the pager row - confirm against the live page).
        for (int r = 1; r < trs.Count - 1; r++)
        {
            HtmlNodeCollection tds = trs[r].SelectNodes(@"td");
            if (tds == null || tds.Count < 2)
            {
                continue;
            }

            // The second cell holds the enterprise name wrapped in a link to its detail page.
            string par = GetHtmlAHref(tds[1].InnerHtml);
            if (par.Length == 0)
            {
                continue; // no link, nothing to look up
            }

            //GetBasicInfo(par); // basic information
            GetCertByUrl(par, tds[1].InnerText); // qualification certificates
        }
    }

    /// <summary>
    /// Downloads the certificate page of one enterprise and appends one row per certificate
    /// to <c>dt2</c>. Fields whose element is missing are stored as empty strings.
    /// </summary>
    /// <param name="par">Query string (without the leading '?') identifying the enterprise.</param>
    /// <param name="name">Enterprise name stored with every certificate row.</param>
    private void GetCertByUrl(string par, string name)
    {
        var html = url + "SubCorpCert.aspx?" + par;
        var web = new HtmlWeb();
        HtmlAgilityPack.HtmlDocument htmlDoc = web.Load(html);

        var res = htmlDoc.GetElementbyId("DataList1"); // certificate table
        if (res == null)
        {
            return;
        }

        var trs = res.SelectNodes(@"tr");
        if (trs == null)
        {
            return;
        }

        for (int r = 0; r < trs.Count; r++)
        {
            var tds = trs[r].SelectNodes(@"td");
            if (tds == null || tds.Count == 0)
            {
                continue;
            }

            // Parse the cell once instead of once per field.
            var cell = new HtmlAgilityPack.HtmlDocument();
            cell.LoadHtml(tds[0].InnerHtml);

            // Item ids use at least two digits (ctl00 ... ctl09, ctl10, ...); the previous
            // "ctl0" + r produced "ctl010" for the eleventh row and found nothing.
            string idPrefix = "DataList1_ctl"
                + r.ToString("D2", System.Globalization.CultureInfo.InvariantCulture)
                + "_";

            DataRow dr = dt2.NewRow();
            dr["企业名称"] = name;
            dr["资质类型"] = GetInnerTextById(cell, idPrefix + "CertType");
            dr["资质证书编号"] = GetInnerTextById(cell, idPrefix + "CertIDLabel");
            dr["发证机关"] = GetInnerTextById(cell, idPrefix + "OrganNameLabel");
            dr["发证日期"] = GetInnerTextById(cell, idPrefix + "Label3");
            dr["有效期至"] = GetInnerTextById(cell, idPrefix + "Label1");
            dr["资质范围"] = GetInnerTextById(cell, idPrefix + "Label2");
            dt2.Rows.Add(dr);
        }

        GDV.DataSource = dt2;
    }

    // TODO: collect enterprise staff information into dt3.
    // TODO: collect enterprise bid-winning information into dt4.
    // TODO: collect enterprise good/bad conduct records into dt5.

    /// <summary>
    /// Returns the inner text of the element with the given id inside an HTML fragment.
    /// </summary>
    /// <param name="table">HTML fragment to parse.</param>
    /// <param name="ID">Id of the element to read.</param>
    /// <returns>The element's inner text, or an empty string when the fragment is empty or the id is not found.</returns>
    private string GetInfoByDocStr(string table, string ID)
    {
        if (string.IsNullOrEmpty(table))
        {
            return string.Empty;
        }

        var Doc = new HtmlAgilityPack.HtmlDocument();
        Doc.LoadHtml(table);
        return GetInnerTextById(Doc, ID);
    }

    /// <summary>
    /// Extracts the query string of the first link found in an HTML fragment.
    /// </summary>
    /// <param name="htmla">HTML fragment containing an <c>&lt;a href="..."&gt;</c> tag.</param>
    /// <returns>
    /// The part of the href after the first '?', with <c>&amp;amp;</c> decoded to <c>&amp;</c>;
    /// the whole href when it contains no '?'; an empty string when no link is found.
    /// </returns>
    public string GetHtmlAHref(string htmla)
    {
        if (string.IsNullOrEmpty(htmla))
        {
            return string.Empty;
        }

        string href = AnchorHrefRegex.Match(htmla).Groups["href"].Value;
        int strindex = href.IndexOf("?", StringComparison.Ordinal);

        // The href is taken from raw markup, where '&' is written as "&amp;"; decode it so the
        // value can be appended to a request URL.
        return href.Substring(strindex + 1).Replace("&amp;", "&");
    }

    /// <summary>
    /// Returns the inner text of a browser element, or an empty string when the element or its text is missing.
    /// </summary>
    private static string GetElementText(HtmlElement element)
    {
        if (element == null || element.InnerText == null)
        {
            return string.Empty;
        }

        return element.InnerText;
    }

    /// <summary>
    /// Returns the inner text of the element with the given id, or an empty string when the
    /// document has no such element.
    /// </summary>
    private static string GetInnerTextById(HtmlAgilityPack.HtmlDocument doc, string id)
    {
        HtmlNode node = doc.GetElementbyId(id);
        return node == null ? string.Empty : node.InnerText;
    }

    /// <summary>
    /// Parses the first run of ASCII digits found in <paramref name="text"/>.
    /// </summary>
    /// <returns><c>true</c> when a number was found and fits into an <see cref="int"/>.</returns>
    private static bool TryParseFirstInt(string text, out int value)
    {
        value = 0;
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        Match digits = Regex.Match(text, "[0-9]+");
        return digits.Success
            && int.TryParse(
                digits.Value,
                System.Globalization.NumberStyles.None,
                System.Globalization.CultureInfo.InvariantCulture,
                out value);
    }
}
}