62,074
社区成员
发帖
与我相关
我的任务
分享
// File name for this run: a timestamp (yyyyMMddHHmmssffff) captured when the field is initialized.
// Page_Load passes it to SetXmlFile when writing the history record.
public string FileName = DateTime.Now.ToString("yyyyMMddHHmmssffff");
// Run counters — count: total items, cg: succeeded, sb: failed, yc: exceptions, cf: duplicates/filtered
public int count = 0, cg = 0, sb = 0, yc = 0, cf = 0;
/// <summary>
/// Runs one collection pass on page load: fetches every listing url returned by
/// GetMainPage, extracts title, price and contact number, filters out sold,
/// duplicate and out-of-range entries, then exports the accepted rows, writes a
/// history record, appends the collected numbers to the history text file and
/// binds the rows to the repeater.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event arguments (unused).</param>
protected void Page_Load(object sender, EventArgs e)
{
    DateTime timer = DateTime.Now; // start of the whole run, used for the summary record
    ArrayList titleList = GetMainPage();

    // Temporary in-memory table holding the accepted rows; every column is a string.
    System.Data.DataTable dt = new System.Data.DataTable("Collecting");
    string[] columnNames =
    {
        "Title",   // listing title
        "Price",   // reference price
        "Contact", // contact number
        "Time",    // start time of this item's collection
        "Timer",   // time spent collecting this item
        "State",   // collection state
        "Url"      // url the row was collected from
    };
    foreach (string columnName in columnNames)
    {
        dt.Columns.Add(new DataColumn(columnName, typeof(string)));
    }

    // Load the numbers collected by previous runs. A missing file simply means
    // there is no history yet (the append further down creates the file).
    string txtFile = Server.MapPath("~/History/SetRepeatFile.txt");
    string txtFileRead = "";
    if (System.IO.File.Exists(txtFile))
    {
        using (FileStream fs = new FileStream(txtFile, FileMode.Open, FileAccess.Read))
        using (StreamReader sr = new StreamReader(fs))
        {
            txtFileRead = sr.ReadToEnd();
        }
    }

    // A contact must be exactly 11 digits starting with 1; built once, not per item.
    Regex mobilePattern = new Regex("^1\\d{10}$");

    string repeat = ""; // comma-prefixed list of numbers accepted during this run
    count = titleList.Count; // total number of items to process
    for (int k = 0; k < count; k++)
    {
        // Start of this item's collection, used for the Time and Timer columns.
        DateTime startTime = DateTime.Now;
        try
        {
            string content = GetPage(titleList[k].ToString()); // page source
            string xsState = Regex.Match(content, @"(?<=<span([\s\S]*)class=""pro_center_r""([\s\S]*)>([\s\S]*)销售状态:)(.[^<]*)").Value;
            xsState = Regex.Replace(xsState, @"<font(.*)>([^<].+?)", "$2");

            // Skip pages that failed to load, report a missing item, are too short, or are already sold.
            if (content == "" || content.IndexOf("该藏不存在", StringComparison.Ordinal) >= 0 || content.Length < 200 || xsState == "已售")
            {
                sb++;
                continue;
            }

            DataRow dr = dt.NewRow();

            // The contact number comes from a second request whose url is embedded in the page script.
            string url = Regex.Match(content, @"(?<=\$\.getJSON\(\"")((?:http:\//usercenter.abc123.com/GetInfo.asp)(?:(?!\"").)*)(?=\"")").Groups[1].Value;
            string content2 = GetPage(url);
            string contact = Regex.Match(content2, @"(?<=Mobile\""\:(\""))((?:\d)(?:(?!\"").)*)(?=\"",)").Value;

            // Reject empty or malformed numbers, numbers collected by earlier runs,
            // and numbers already seen in this run.
            bool isNewValidContact =
                mobilePattern.IsMatch(contact) &&
                txtFileRead.IndexOf(contact, StringComparison.Ordinal) < 0 &&
                repeat.IndexOf(contact, StringComparison.Ordinal) < 0;
            if (!isNewValidContact)
            {
                cf++;
                continue;
            }

            // Recorded before the price filter, so a number whose listing is later
            // rejected on price is still treated as seen.
            repeat += "," + contact;
            dr["Contact"] = contact;

            // NOTE(review): the character class [\s| |/g] below also strips '|', '/' and
            // the letter 'g' (it looks like a JavaScript /g flag ended up inside the
            // class). Patterns are kept unchanged here — confirm the intent before editing.
            string title = Regex.Match(content, @"(?<=<div([\s\S]*)id=""pro_title""([\s\S]*)>.*</span>).*(?=[^<])").Value;
            title = Regex.Replace(title, @"([\s| |/g])", "");
            dr["Title"] = Regex.Match(title, @"(.[^<])*").Value;

            string price = Regex.Match(content, @"(?<=>参考价:([\s\S]*)<span([\s\S]*)class=""price"">)([\s\S]+?)(?=<\/span>)").Value;
            price = Regex.Replace(price, @"<font(.*)>([^<].+?)</font>", "$2", RegexOptions.IgnoreCase);
            price = Regex.Replace(price, @"([\s| |/g])", "");
            dr["Price"] = price;

            // Keep a listing only when its price is negotiable ("议价"), or a plain
            // number that is >= 100000 or <= 50. TryParse (digits only) replaces the
            // former Convert.ToInt32, which threw on empty or oversized values.
            string priceRep = price.Replace("元", "");
            long priceValue;
            bool priceAccepted;
            if (long.TryParse(priceRep, System.Globalization.NumberStyles.None, System.Globalization.CultureInfo.InvariantCulture, out priceValue))
            {
                priceAccepted = priceValue >= 100000 || priceValue <= 50;
            }
            else
            {
                priceAccepted = priceRep == "议价";
            }

            dr["Time"] = startTime.ToString("yyyy-MM-dd HH:mm:ss:ffff");
            // Elapsed time for this item, measured from startTime and shown as
            // "<seconds>.<milliseconds>ms" (display format kept from the original).
            TimeSpan ts = DateTime.Now - startTime;
            dr["Timer"] = ts.Seconds.ToString() + "." + ts.Milliseconds.ToString() + "ms";
            dr["State"] = "成功";
            dr["Url"] = titleList[k].ToString();

            if (priceAccepted)
            {
                dt.Rows.Add(dr);
                cg++;
            }
            else
            {
                sb++;
            }
        }
        catch (Exception ex)
        {
            DAL.ErrorLog.WriteLog(ex); // record the exception and move on to the next item
            yc++;
        }
    }

    if (dt.Rows.Count > 0)
    {
        // Export the accepted rows to an Excel file.
        CreateExcelFile(dt);

        // Write a summary of this run to the xml history.
        string cTime = timer.ToString(); // time of this run
        string c_count = titleList.Count.ToString(); // total items
        TimeSpan tss = DateTime.Now - timer;
        string c_timer = tss.Minutes.ToString() + "." + tss.Seconds.ToString() + "s";
        string c_state = "<span>成功" + cg + "</span><span class=\"re\">过滤" + cf + "</span><span class=\"fail\">失败" + sb + "</span><span class=\"ero\">异常" + yc + "</span>";
        SetXmlFile(FileName, cTime, c_count, c_timer, c_state);

        // Append this run's numbers to the history file so later runs skip them.
        // repeat is non-empty here: a row is only added after its contact was appended.
        using (StreamWriter sw = new StreamWriter(txtFile, true))
        {
            sw.WriteLine("\r" + DateTime.Now.ToString("yyyy年MM月dd日 HH:mm:ss") + " 采集\r");
            sw.WriteLine(repeat.Substring(1));
            sw.WriteLine("\n");
        }

        // Bind the rows to the page.
        this.repCollectingList.DataSource = dt;
        this.repCollectingList.DataBind();
    }
    else
    {
        Record.InnerHtml = "<p style=\"color:red; text-align:center;\">请检查采集设置,此次没有采集到任何数据哦!</p>";
    }
}
/// <summary>
/// Downloads a page and returns its source as text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The response body decoded with <see cref="Encoding.Default"/>, or an empty
/// string when the request fails for any reason (invalid url, network error,
/// non-HTTP scheme, ...). Callers treat the empty string as "page unavailable".
/// </returns>
private static string GetPage(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";

        // Dispose the response, its stream and the reader even when reading fails,
        // so the underlying connection is released.
        // The original author notes that the target pages are GBK-encoded.
        // NOTE(review): Encoding.Default follows the host configuration, so this
        // presumably relies on the server's default code page being GBK — confirm.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream hwStream = response.GetResponseStream())
        using (StreamReader sData = new StreamReader(hwStream, Encoding.Default))
        {
            return sData.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: any failure is reported to the caller as an empty page.
        return "";
    }
}
// Example: split a list of urls into at most maxTasks chunks and process the chunks in parallel.
List<string> urls = new List<string>();
// Number of chunks handed to Parallel.For. Per the original author this does not mean
// 10 threads are actually started; the real number depends on
// ThreadPool.SetMaxThreads / ThreadPool.SetMinThreads.
int maxTasks = 10;
// Chunk size rounded up, so the remainder of urls.Count / maxTasks is not skipped.
// (The original tested maxTasks % maxTasks, which is always 0, and never rounded up.)
int splitLength = (urls.Count + maxTasks - 1) / maxTasks;
Parallel.For(0, maxTasks, i =>
{
    for (int j = 0; j < splitLength; j++)
    {
        var idx = j + i * splitLength;
        if (idx >= urls.Count)
        {
            // The last chunk(s) may be shorter than splitLength or empty.
            break;
        }
        var url = urls[idx];
        // request the url here
    }
});
// The snippet above is an example implemented with Parallel.