WebBrowser模拟点击爬取分页数据,怎么控制点击次数

往事只能回味味道 2018-03-15 04:37:15
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Web;
using System.Text;
using HtmlAgilityPack;
using Conn;

namespace DataApp
{
    /// <summary>
    /// Main window of the scraper. Loads a paged company list in the embedded
    /// WebBrowser control, scrapes each page with HtmlAgilityPack, and moves on by
    /// clicking the grid's "next page" button until the last page has been read.
    /// </summary>
    public partial class MainForm : Form
    {
        // Extracts the href value of the first quoted <a ... href="..."> tag.
        private static readonly Regex AnchorHrefRegex = new Regex(
            @"<a[^>]*href=([""'])?(?<href>[^'""]+)\1[^>]*>",
            RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // Picks the first run of ASCII digits out of a pager label.
        private static readonly Regex FirstIntRegex = new Regex(@"[0-9]+", RegexOptions.Compiled);

        string url = @"http://117.159.3.6:9035/QueryWeb/";

        // Navigations started minus documents completed; 0 means the page is fully loaded.
        int count = 0;

        // Page number that was scraped most recently (0 = nothing scraped yet).
        int lastPageIndex = 0;

        DataTable dt1 = new DataTable("t1"); // company basic information
        DataTable dt2 = new DataTable("t2"); // company qualification certificates

        public MainForm()
        {
            InitializeComponent();

            // The tables must have their columns before any row is filled in;
            // assigning dr["..."] on a column-less table throws ArgumentException.
            AddTextColumns(dt1,
                "企业名称", "统一信用代码", "注册地址", "企业类型", "注册日期",
                "营业地址", "营业地址邮编", "法定代表人", "官网");
            AddTextColumns(dt2,
                "企业名称", "资质类型", "资质证书编号", "发证机关", "发证日期",
                "有效期至", "资质范围");
        }

        /// <summary>
        /// Form load handler. The table schemas are built in the constructor, so
        /// there is nothing left to do here; the method is kept because the
        /// designer-generated part of the class may reference it.
        /// </summary>
        private void MainForm_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Adds a string column for every name that the table does not have yet.
        /// </summary>
        private static void AddTextColumns(DataTable table, params string[] names)
        {
            foreach (string name in names)
            {
                if (!table.Columns.Contains(name))
                {
                    table.Columns.Add(name, typeof(string));
                }
            }
        }

        /// <summary>
        /// Starts a crawl of the construction-company list.
        /// </summary>
        private void BtnCai_Click(object sender, EventArgs e)
        {
            string jzurl = "query11.aspx?type=&typeNum=7&Province=1"; // construction companies
            //string sjurl = "query11.aspx?type=工程设计&typeNum=2&Province=1"; // design companies
            //string wsurl = "query41.aspx?Province=2"; // companies from other provinces

            // Detach first so that pressing the button repeatedly never stacks
            // duplicate handlers (each duplicate would decrement count again and
            // click "next" again for the same page).
            webBrowser1.Navigated -= Web_Navigated;
            webBrowser1.DocumentCompleted -= Web_DocumentCompleted;
            webBrowser1.Navigated += Web_Navigated;
            webBrowser1.DocumentCompleted += Web_DocumentCompleted;

            // Fresh crawl: reset the load counter and the page tracker.
            count = 0;
            lastPageIndex = 0;

            // Navigate only after the handlers are attached.
            webBrowser1.Navigate(url + jzurl);
        }

        /// <summary>
        /// Fires for every completed document (including frames). Once all pending
        /// navigations are complete, scrapes the current page and clicks "next"
        /// unless the last page has been reached.
        /// </summary>
        private void Web_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            count = count - 1;
            if (count != 0)
            {
                return; // other navigations are still loading
            }

            System.Windows.Forms.HtmlDocument htdoc = webBrowser1.Document;
            if (htdoc == null)
            {
                return;
            }

            // Any of these may be missing (GetElementById returns null), e.g. when
            // a page has a different number of rows and the pager gets another id.
            HtmlElement htmlcounts = htdoc.GetElementById("ctl00_ContentPlaceHolder1_GridView1_ctl13_Label4");
            HtmlElement htmlpages = htdoc.GetElementById("lblPageCount");
            HtmlElement htmlpagesindex = htdoc.GetElementById("ctl00_ContentPlaceHolder1_GridView1_ctl13_lblPageIndex");
            HtmlElement btnclicktag = htdoc.GetElementById("ctl00_ContentPlaceHolder1_GridView1_ctl13_btnNext");

            int pageIndex = ParseFirstInt(htmlpagesindex);
            int pageCount = ParseFirstInt(htmlpages);

            // The same page came back again: the previous click did not advance
            // (or the event fired twice). Stop instead of scraping it again.
            if (pageIndex > 0 && pageIndex == lastPageIndex)
            {
                return;
            }

            GetInfoByDOM(webBrowser1);
            lastPageIndex = pageIndex;

            if (htmlcounts != null)
            {
                lblCounts.Text = htmlcounts.InnerText; // total number of records
            }
            if (htmlpages != null)
            {
                LblPages.Text = htmlpages.InnerText; // total number of pages
            }
            if (htmlpagesindex != null)
            {
                lblPageIndex.Text = htmlpagesindex.InnerText; // current page number
            }

            // Stop when there is no "next" button or the last page was just read.
            if (btnclicktag == null)
            {
                return;
            }
            if (pageIndex > 0 && pageCount > 0 && pageIndex >= pageCount)
            {
                return;
            }

            btnclicktag.InvokeMember("click"); // go to the next page
        }

        /// <summary>
        /// Counts every started navigation so that DocumentCompleted can tell when
        /// the whole page has finished loading.
        /// </summary>
        private void Web_Navigated(object sender, WebBrowserNavigatedEventArgs e)
        {
            count++;
        }

        /// <summary>
        /// Returns the first integer found in the element's text, or 0 when the
        /// element is missing or contains no digits.
        /// </summary>
        private static int ParseFirstInt(HtmlElement element)
        {
            if (element == null || string.IsNullOrEmpty(element.InnerText))
            {
                return 0;
            }

            Match digits = FirstIntRegex.Match(element.InnerText);
            int value;
            if (digits.Success && int.TryParse(digits.Value, out value))
            {
                return value;
            }
            return 0;
        }

        /// <summary>
        /// Returns the inner text of the node with the given id, or an empty string
        /// when the document has no such node.
        /// </summary>
        private static string GetNodeText(HtmlAgilityPack.HtmlDocument doc, string id)
        {
            HtmlNode node = doc.GetElementbyId(id);
            return node == null ? string.Empty : node.InnerText;
        }

        /// <summary>
        /// Downloads a company's detail page and appends its basic information to dt1.
        /// </summary>
        /// <param name="par">Query string (without the leading '?') of the detail page.</param>
        public void GetBasicInfo(string par)
        {
            var html = url + "CorpDetails.aspx?" + par;
            var web = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument htmlDoc = web.Load(html);

            DataRow dr = dt1.NewRow();
            dr["企业名称"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label10");
            dr["统一信用代码"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label3");
            dr["注册地址"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label1");
            dr["企业类型"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label2");
            dr["注册日期"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label4");
            dr["营业地址"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label6");
            dr["营业地址邮编"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label7");
            dr["法定代表人"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label8");
            dr["官网"] = GetNodeText(htmlDoc, "ctl00_ContentPlaceHolder1_FormView1_Label13");
            dt1.Rows.Add(dr);
            GDV.DataSource = dt1;
        }

        /// <summary>
        /// Database connectivity test: shows the first result table of a fixed query.
        /// </summary>
        private void BtnData_Click(object sender, EventArgs e)
        {
            string sql = "SELECT * FROM ims_hulu_info_shop";
            DataSet ds = DbHelperMySQL.Query(sql);
            if (ds == null || ds.Tables.Count == 0)
            {
                return; // nothing to display
            }
            GDV.DataSource = ds.Tables[0];
        }

        /// <summary>
        /// Scrapes the company grid of the page currently loaded in the browser and
        /// collects the qualification data of every company listed on it.
        /// </summary>
        private void GetInfoByDOM(WebBrowser WebBro)
        {
            var Doc = new HtmlAgilityPack.HtmlDocument();
            Doc.LoadHtml(WebBro.DocumentText ?? string.Empty);

            HtmlNode grid = Doc.GetElementbyId("ctl00_ContentPlaceHolder1_GridView1");
            if (grid == null)
            {
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection trs = grid.SelectNodes(@"tr");
            if (trs == null)
            {
                return;
            }

            // Row 0 is the header and the final row is skipped as well (it holds the
            // pager), so only the rows in between are data rows.
            for (int r = 1; r < trs.Count - 1; r++)
            {
                HtmlNodeCollection tds = trs[r].SelectNodes(@"td");
                if (tds == null || tds.Count < 2)
                {
                    continue;
                }

                // The second cell holds the company name wrapped in the detail link.
                string par = GetHtmlAHref(tds[1].InnerHtml);
                if (par.Length == 0)
                {
                    continue; // no link in this row, nothing to follow
                }

                //GetBasicInfo(par); // basic information
                GetCertByUrl(par, tds[1].InnerText); // qualification information
            }
        }

        /// <summary>
        /// Downloads a company's qualification page and appends one row per
        /// certificate to dt2.
        /// </summary>
        /// <param name="par">Query string (without the leading '?') identifying the company.</param>
        /// <param name="name">Company name stored with every certificate row.</param>
        private void GetCertByUrl(string par, string name)
        {
            var html = url + "SubCorpCert.aspx?" + par;
            var web = new HtmlWeb();
            HtmlAgilityPack.HtmlDocument htmlDoc = web.Load(html);

            HtmlNode list = htmlDoc.GetElementbyId("DataList1");
            if (list == null)
            {
                return;
            }

            HtmlNodeCollection trs = list.SelectNodes(@"tr");
            if (trs == null)
            {
                return;
            }

            bool added = false;
            for (int r = 0; r < trs.Count; r++)
            {
                HtmlNodeCollection tds = trs[r].SelectNodes(@"td");
                if (tds == null || tds.Count == 0)
                {
                    continue;
                }

                // Item ids are zero-padded to two digits (ctl00 ... ctl09, ctl10, ...);
                // a literal "ctl0" + r would produce "ctl010" for the 11th item.
                string prefix = "DataList1_ctl"
                    + r.ToString("D2", System.Globalization.CultureInfo.InvariantCulture)
                    + "_";
                string cellHtml = tds[0].InnerHtml;

                DataRow dr = dt2.NewRow();
                dr["企业名称"] = name;
                dr["资质类型"] = GetInfoByDocStr(cellHtml, prefix + "CertType");
                dr["资质证书编号"] = GetInfoByDocStr(cellHtml, prefix + "CertIDLabel");
                dr["发证机关"] = GetInfoByDocStr(cellHtml, prefix + "OrganNameLabel");
                dr["发证日期"] = GetInfoByDocStr(cellHtml, prefix + "Label3");
                dr["有效期至"] = GetInfoByDocStr(cellHtml, prefix + "Label1");
                dr["资质范围"] = GetInfoByDocStr(cellHtml, prefix + "Label2");
                dt2.Rows.Add(dr);
                added = true;
            }

            if (added)
            {
                GDV.DataSource = dt2; // show the collected certificates
            }
        }

        // TODO: collect company staff information into dt3
        // TODO: collect company bid-winning information into dt4
        // TODO: collect company credit (good/bad record) information into dt5

        /// <summary>
        /// Returns the text of the element with the given id inside an HTML fragment.
        /// </summary>
        /// <param name="table">HTML fragment to search.</param>
        /// <param name="ID">Id of the element whose text is wanted.</param>
        /// <returns>The element's inner text, or an empty string if it is not found.</returns>
        private string GetInfoByDocStr(string table, string ID)
        {
            var Doc = new HtmlAgilityPack.HtmlDocument();
            Doc.LoadHtml(table ?? string.Empty);
            return GetNodeText(Doc, ID);
        }

        /// <summary>
        /// Extracts the query string of the first hyperlink in an HTML fragment.
        /// </summary>
        /// <param name="htmla">HTML fragment containing an &lt;a href="..."&gt; tag.</param>
        /// <returns>
        /// The part of the href after the first '?', with "&amp;amp;" decoded to
        /// "&amp;"; the whole href when it has no '?'; an empty string when no
        /// link is found.
        /// </returns>
        public string GetHtmlAHref(string htmla)
        {
            if (string.IsNullOrEmpty(htmla))
            {
                return string.Empty;
            }

            Match item = AnchorHrefRegex.Match(htmla);
            string href = item.Groups["href"].Value;

            // IndexOf yields -1 when there is no '?', so Substring(0) keeps the whole href.
            int strindex = href.IndexOf('?');

            // The href comes from raw markup, where '&' is written as "&amp;";
            // decode it so the parameters can be appended to a URL.
            return href.Substring(strindex + 1).Replace("&amp;", "&");
        }
    }

}
...全文
304 6 打赏 收藏 转发到动态 举报
写回复
用AI写文章
6 条回复
切换为时间正序
请发表友善的回复…
发表回复
FainSheeg 2018-12-18
  • 打赏
  • 举报
回复
引用 5 楼 Jacky0319 的回复:
谢谢孤独侠 分享! 我也为这个问题 烦恼了一周。 如何抓取下面网址 高管名单 这个 表格的 第2,3, 4..最后页呢 ? 找了 10多篇文章,没有结果
http://data.10jqka.com.cn/financial/ggjy/

给你个网址:http://data.10jqka.com.cn/ajax/ggjy/field/enddate/order/desc/page/3/ajax/1/
要切换页码就改Page后面的数字
Jacky0319 2018-12-17
  • 打赏
  • 举报
回复
谢谢孤独侠 分享! 我也为这个问题 烦恼了一周。 如何抓取下面网址 高管名单 这个 表格的 第2,3, 4..最后页呢 ? 找了 10多篇文章,没有结果
http://data.10jqka.com.cn/financial/ggjy/
孤独侠 2018-12-15
  • 打赏
  • 举报
回复
这是我以前写的利用timer控件来查询分页:
/// <summary>
/// Timer tick: saves the rows of the page currently shown, then moves to the next
/// page, or stops the timer when the page has no "next page" link any more.
/// </summary>
private void timer1_Tick(object sender, EventArgs e)
{
    GetHsData();

    // Look the link up again on every tick: the element found on an earlier page
    // belongs to a document that has since been replaced.
    if (GetNextPage())
    {
        btnNext.InvokeMember("click");
    }
    else
    {
        timer1.Stop(); // last page reached, no more clicks
    }
}

/// <summary>
/// Starts the paged download: if the query form is available and the result has a
/// "next page" link, a timer drives the paging every 20 seconds.
/// </summary>
private void btnDownload_Click(object sender, EventArgs e)
{
    if (GetQueryForm())
    {
        //tboxHS.SetAttribute("value", "38089119");
        //btnSubmit.InvokeMember("click");
        if (GetNextPage())
        {
            timer1.Interval = 20000; // milliseconds between page turns
            timer1.Start();
        }
    }
    ////tboxStartDate.SetAttribute("value", "2017-01-01");
    ////tboxEndDate.SetAttribute("value", "2017-12-31");
}

/// <summary>
/// Finds the "next page" link in the current document and stores it in btnNext.
/// </summary>
/// <returns>true when a link was found; otherwise false (btnNext is null).</returns>
private bool GetNextPage()
{
    btnNext = null;
    if (wbrMain.Document == null)
    {
        return false;
    }

    HtmlElementCollection anchors = wbrMain.Document.GetElementsByTagName("a");
    foreach (HtmlElement item in anchors)
    {
        string outerHtml = item.OuterHtml;
        if (outerHtml != null && outerHtml.IndexOf("下一页", StringComparison.Ordinal) >= 0)
        {
            btnNext = item; // keeps the last matching link, as before
        }
    }
    return btnNext != null;
}

/// <summary>
/// Copies every table row of the current document that has at least ten cells
/// into HsData, saves the batch, and empties the table for the next page.
/// </summary>
private void GetHsData()
{
    if (wbrMain.Document == null)
    {
        return;
    }

    HtmlElementCollection tbs = wbrMain.Document.GetElementsByTagName("TABLE");
    foreach (HtmlElement tb in tbs)
    {
        HtmlElementCollection trs = tb.GetElementsByTagName("TR");
        foreach (HtmlElement tr in trs)
        {
            HtmlElementCollection tds = tr.GetElementsByTagName("TD");

            // Cells 0..9 are read below, so shorter rows (headers, pager rows)
            // are skipped instead of failing on a missing cell.
            if (tds.Count < 10)
            {
                continue;
            }

            // One row per <tr>; the ID is generated once per row.
            DataRow dr = HsData.NewRow();
            dr["ID"] = AutoPrimaryID.GenerateStringID();
            dr["ITEM_NO"] = tds[0].InnerText;
            dr["IE_DATE"] = tds[1].InnerText;
            dr["HS_CODE"] = tds[2].InnerText;
            dr["OWNER_NAME"] = tds[3].InnerText;
            dr["PRODUCT_DESC"] = tds[4].InnerText;
            dr["COUNTRY_NAME"] = tds[5].InnerText;
            dr["CUSTOMS_NAME"] = tds[6].InnerText;
            dr["ORIGIN_AREA"] = tds[7].InnerText;
            dr["DOLLAR_CURR"] = tds[8].InnerText;
            dr["QTY_UNIT"] = tds[9].InnerText;
            HsData.Rows.Add(dr);
        }
    }

    CMMBLL.UpateData("HS_TEMP", HsData);
    HsData.AcceptChanges();
    HsData.Clear();
    HsData.AcceptChanges();
}

/// <summary>
/// Enables the download button once the login flag is set; otherwise starts the login.
/// </summary>
private void wbrMain_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    if (onLogin)
    {
        btnDownload.Enabled = true;
        isLogin = true;
    }
    else
    {
        Login();
    }
}
Jacky0319 2018-12-15
  • 打赏
  • 举报
回复
兄弟太急了,一开始上代码, 应该 先简单描述问题的要点, 大神们才好帮你呀。 现在解决了否?
  • 打赏
  • 举报
回复
没有人急急急急急急急急急
  • 打赏
  • 举报
回复
??个急急急急急急急急急急急急急急急急急急

110,538

社区成员

发帖
与我相关
我的任务
社区描述
.NET技术 C#
社区管理员
  • C#
  • Web++
  • by_封爱
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告

让您成为最强悍的C#开发者

试试用AI创作助手写篇文章吧