110,534
社区成员
发帖
与我相关
我的任务
分享
/// <summary>
/// Downloads the list page of one journal phase, loads it into an MSHTML document and
/// hands it to <c>GetPaperData</c> for extraction. Posts <c>PaperThreaExit</c> to the UI
/// when done.
/// </summary>
/// <param name="o">
/// Expected to be a <c>VIP.Model.Phase</c>; null or any other type makes the method
/// return immediately without doing anything.
/// </param>
void GrabPaperByVIPWebClient(object o)
{
    VIP.Model.Phase phase = o as VIP.Model.Phase;
    if (phase == null)
        return;

    VIPWebClient wc = new VIPWebClient();
    if (IsUrl(phase.Url))
    {
        // When grabbing has been switched off we leave right away; note that this path
        // does not post PaperThreaExit (kept as in the original).
        if (!grabbing)
            return;

        grabjournalcount++;
        this.BeginInvoke(new EventHandler(SetStatus), "访问页面 - " + grabjournalcount + "/" + journalcount);
        try
        {
            byte[] bResponse = wc.DownloadData(phase.Url);
            string strResponse = Encoding.UTF8.GetString(bResponse);

            // Parse the downloaded markup with MSHTML.
            IHTMLDocument2 document = new HTMLDocumentClass();
            document.designMode = "on";
            document.write(strResponse);
            document.close();

            GetPaperData(document, "抓取文章", phase.ID.ToString());
        }
        catch (WebException webEx)
        {
            Add2Log("错误", "从地址(" + phase.Url + ")抓取时发生异常,详细信息:" + webEx.Message);

            // Decide on retry from the structured status first: the exception message is
            // localized, so text matching alone misses timeouts/403s on other UI cultures.
            // The original text checks are kept so existing behavior is not narrowed.
            HttpWebResponse httpResponse = webEx.Response as HttpWebResponse;
            bool timedOut = webEx.Status == WebExceptionStatus.Timeout
                || webEx.Message.Contains("超时");
            bool forbidden = (httpResponse != null && httpResponse.StatusCode == HttpStatusCode.Forbidden)
                || webEx.Message.Contains("403");

            if (timedOut || forbidden)
            {
                overtimephaseList.Add(phase);
                Add2Log("信息", "从地址(" + phase.Url + ")抓取时发生错误,将该任务加入重试队列!");
            }
        }
        catch (System.AccessViolationException avEx)
        {
            // NOTE(review): on .NET Framework 4+ corrupted-state exceptions are normally not
            // delivered to catch blocks unless the method opts in - confirm this handler
            // is still reachable on the targeted runtime.
            Add2Log("错误", "将从地址(" + phase.Url + ")抓取的数据传入mshtml时发生异常,详细信息:" + avEx.Message);
        }
    }
    else
    {
        Add2Log("警告", phase.Journal.JournalName + phase.PhaseString + " 地址(" + phase.Url + ")不正确,无法进行抓取操作!");
    }

    this.BeginInvoke(new EventHandler(PaperThreaExit), null);
}
/// <summary>
/// Extracts paper entries (title, author, link) from a parsed journal list page and saves
/// them through <c>VIP.BLL.Paper.Save</c>.
/// </summary>
/// <remarks>
/// Containers, paper elements, author elements and title elements are located by tag name
/// and filtered with the criteria stored in the "期刊抓取" configuration group. If any of
/// the required configuration values is missing or empty the method does nothing.
/// For a paper whose keywords are still empty and whose link is a valid URL, the detail
/// page is downloaded and passed to <c>GetPaperDetailData</c>. Download errors are not
/// caught here and propagate to the caller.
/// </remarks>
/// <param name="document">Parsed list page; null makes the method return immediately.</param>
/// <param name="type">Not used by this method.</param>
/// <param name="phaseid">Phase identifier as text; it is converted with <c>Convert.ToInt32</c>.</param>
void GetPaperData(IHTMLDocument2 document, string type, string phaseid)
{
    if (document == null)
        return;

    VIP.BLL.Paper pp = new VIP.BLL.Paper();
    List<VIP.Model.Paper> papers = pp.GetPaperList(phaseid);
    VIPWebClient wc = new VIPWebClient();

    VIP.Model.Config tag = GetConfig("期刊抓取", "文章列表容器tag");
    VIP.Model.Config attr = GetConfig("期刊抓取", "文章列表容器识别参数及值");
    VIP.Model.Config wtag = GetConfig("期刊抓取", "文章容器tag");
    VIP.Model.Config ttag = GetConfig("期刊抓取", "文章标题容器tag");
    VIP.Model.Config tattr = GetConfig("期刊抓取", "文章标题容器识别参数及值");
    VIP.Model.Config atag = GetConfig("期刊抓取", "文章作者容器tag");
    VIP.Model.Config aattr = GetConfig("期刊抓取", "文章作者容器识别参数及值");
    VIP.Model.Config xpath = GetConfig("期刊抓取", "文章容器XPath");
    VIP.Model.Config txpath = GetConfig("期刊抓取", "文章标题容器XPath");
    VIP.Model.Config axpath = GetConfig("期刊抓取", "文章作者容器XPath");

    // All ten settings must be present. The three XPath values are only checked for
    // presence here; this method itself locates elements by tag name and criteria.
    bool configured = HasConfigValue(axpath) && HasConfigValue(txpath) && HasConfigValue(xpath)
        && HasConfigValue(tag) && HasConfigValue(attr) && HasConfigValue(wtag)
        && HasConfigValue(ttag) && HasConfigValue(tattr)
        && HasConfigValue(atag) && HasConfigValue(aattr);
    if (!configured)
        return;

    string[] containerCriteria = ParseAttributeCriteria(attr.ConfigValue);
    string[] titleCriteria = ParseAttributeCriteria(tattr.ConfigValue);
    string[] authorCriteria = ParseAttributeCriteria(aattr.ConfigValue);

    // A document without a body has nothing to scan; guard instead of dereferencing null.
    IHTMLElement2 body = document.body as IHTMLElement2;
    if (body == null)
        return;
    IHTMLElementCollection containers = body.getElementsByTagName(tag.ConfigValue);

    // Parse the id once; it is needed for the lookup and for every paper below.
    int phaseKey = Convert.ToInt32(phaseid);
    VIP.BLL.Phase p = new VIP.BLL.Phase();
    VIP.Model.Phase phase = p.GetModel(phaseKey);
    if (phase == null)
        return;

    foreach (IHTMLElement container in containers)
    {
        if (!ElementMatchesCriteria(container, containerCriteria))
            continue;

        IHTMLElementCollection papersList = ((IHTMLElement2)container).getElementsByTagName(wtag.ConfigValue);
        foreach (IHTMLElement paper in papersList)
        {
            // The last matching author element wins, as in the original loop.
            string authorstr = "";
            IHTMLElementCollection authors = ((IHTMLElement2)paper).getElementsByTagName(atag.ConfigValue);
            foreach (IHTMLElement author in authors)
            {
                if (ElementMatchesCriteria(author, authorCriteria))
                    authorstr = author.innerText;
            }

            IHTMLElementCollection titles = ((IHTMLElement2)paper).getElementsByTagName(ttag.ConfigValue);
            foreach (IHTMLElement title in titles)
            {
                if (!ElementMatchesCriteria(title, titleCriteria))
                    continue;

                // A missing href is treated as an empty string, the same way missing
                // attributes are treated when matching criteria.
                object href = title.getAttribute("href");
                string url = (href == null ? "" : href.ToString()).Replace("about:", "");
                url = checkUrl(url);

                // Reuse the already known paper with the same URL, if any.
                VIP.Model.Paper mp = null;
                foreach (VIP.Model.Paper known in papers)
                {
                    if (known.Url == url)
                    {
                        mp = known;
                        break;
                    }
                }
                if (mp == null)
                    mp = new VIP.Model.Paper();

                string titlestr = SafeInnerText(title).Replace("'", "");
                bool needsUpdate = mp.Title != titlestr
                    || mp.JounalID != phaseKey
                    || mp.Author != authorstr
                    || String.IsNullOrEmpty(mp.KeyWords);
                if (needsUpdate)
                {
                    if (IsUrl(url) && String.IsNullOrEmpty(mp.KeyWords))
                    {
                        // Fetch the detail page to fill in the fields the list page lacks.
                        byte[] bResponse = wc.DownloadData(url);
                        string strResponse = Encoding.UTF8.GetString(bResponse);
                        HTMLDocumentClass doc = new HTMLDocumentClass();
                        doc.designMode = "on";
                        doc.IHTMLDocument2_write(strResponse);
                        doc.close();
                        mp = GetPaperDetailData(doc, mp);
                    }
                    mp.Title = titlestr;
                    mp.Url = url;
                    mp.JounalID = phaseKey;
                    mp.Author = authorstr;

                    // Queue unsaved papers exactly once: an entry added earlier in this run
                    // still has ID 0, so checking the ID alone would add it a second time
                    // when the same URL shows up again on the page.
                    if (mp.ID == 0 && !papers.Contains(mp))
                    {
                        papers.Add(mp);
                    }
                }

                grabcount++;
                this.BeginInvoke(new EventHandler(SetCount), grabcount.ToString());
            }
        }
    }

    pp.Save(papers);
}

/// <summary>Returns true when the config entry exists and carries a non-empty value.</summary>
private static bool HasConfigValue(VIP.Model.Config config)
{
    return config != null && !String.IsNullOrEmpty(config.ConfigValue);
}

/// <summary>
/// Splits a criteria setting into its ';'-separated entries after renaming "class" to
/// "className".
/// </summary>
private static string[] ParseAttributeCriteria(string configValue)
{
    // NOTE(review): the original listed ';' twice as separator; possibly one of them was
    // meant to be the full-width variant - confirm against the stored configuration.
    return configValue.Replace("class", "className")
        .Split(new char[] { ';' }, StringSplitOptions.RemoveEmptyEntries);
}

/// <summary>
/// Checks an element against every criterion: a bare value must occur in the element's
/// text, a "name:value" pair must equal the attribute's string value. Entries with any
/// other number of parts are ignored; an empty criteria list matches every element.
/// </summary>
private static bool ElementMatchesCriteria(IHTMLElement element, string[] criteria)
{
    foreach (string criterion in criteria)
    {
        string[] parts = criterion.Split(new char[] { ':' }, StringSplitOptions.RemoveEmptyEntries);
        if (parts.Length == 1)
        {
            if (!SafeInnerText(element).Contains(parts[0]))
                return false;
        }
        else if (parts.Length == 2)
        {
            object value = element.getAttribute(parts[0]);
            string actual = value == null ? "" : value.ToString();
            if (actual != parts[1])
                return false;
        }
    }
    return true;
}

/// <summary>Returns the element's innerText, or an empty string when it is null.</summary>
private static string SafeInnerText(IHTMLElement element)
{
    // Presumably MSHTML yields null for elements without text; never hand null to callers.
    string text = element.innerText;
    return text == null ? "" : text;
}