多线程用mshtml解析html时,内存暴涨,程序中断,如何处理?

chentank 2016-02-21 05:32:26
在做一个抓取工具,使用htmlagilitypack 解析时不会出现该错误,换用mshtml后开了多线程内存占用很快就跳到了1G+,随后就程序中断,是不是我在用mshtml解析html后需要主动释放什么资源,求指点

void GrabPaperByVIPWebClient(object o)
{
VIP.Model.Phase phase = o as VIP.Model.Phase;
if (phase == null)
return;
VIPWebClient wc = new VIPWebClient();
if (IsUrl(phase.Url))
{
if (!grabbing)
return;
grabjournalcount++;
this.BeginInvoke(new EventHandler(SetStatus), "访问页面 - " + grabjournalcount + "/" + journalcount);
string strResponse = "";
try
{
byte[] bResponse = wc.DownloadData(phase.Url);
strResponse = Encoding.UTF8.GetString(bResponse);
//HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
//document.LoadHtml(strResponse);


IHTMLDocument2 document = new HTMLDocumentClass();
document.designMode = "on";
document.write(strResponse);
document.close();
GetPaperData(document, "抓取文章", phase.ID.ToString());
document = null;
}
catch (WebException webEx)
{
Add2Log("错误", "从地址(" + phase.Url + ")抓取时发生异常,详细信息:" + webEx.Message);
if (webEx.Message.Contains("超时") || webEx.Message.Contains("403"))
{
overtimephaseList.Add(phase);
Add2Log("信息", "从地址(" + phase.Url + ")抓取时发生错误,将该任务加入重试队列!");
}

}
catch (System.AccessViolationException avEx)
{
Add2Log("错误", "将从地址(" + phase.Url + ")抓取的数据传入mshtml时发生异常,详细信息:" + avEx.Message);
}
}
else
{
Add2Log("警告", phase.Journal.JournalName + phase.PhaseString + " 地址(" + phase.Url + ")不正确,无法进行抓取操作!");
}
this.BeginInvoke(new EventHandler(PaperThreaExit), null);
}


...全文
197 6 打赏 收藏 转发到动态 举报
写回复
用AI写文章
6 条回复
切换为时间正序
请发表友善的回复…
发表回复
泡泡龙 2016-02-24
  • 打赏
  • 举报
回复
mshtml是线程安全的吗?
WM_JAWIN 2016-02-24
  • 打赏
  • 举报
回复
偷别人的数据,还不如直接解释字符串快
puler 2016-02-22
  • 打赏
  • 举报
回复
一直的想法就是,直接用开源组件获取HTML源代码(如遇AJAX,就用时间以及模拟鼠标键盘的动作,获取完整的HTML代码), 1、假设这个HTML代码是在某个周期内不会进行大幅变动 2、然后对某种标签,统计他的个数,计算位置,然后获取标签内的值
蒋晟 2016-02-21
  • 打赏
  • 举报
回复
Marshal.ReleaseComObject
chentank 2016-02-21
  • 打赏
  • 举报
回复
 void GetPaperData(IHTMLDocument2 document, string type, string phaseid)
        {          
            if (document == null)
                return;
            VIP.BLL.Paper pp = new VIP.BLL.Paper();
            List<VIP.Model.Paper> papers = pp.GetPaperList(phaseid);
            VIPWebClient wc = new VIPWebClient();
            VIP.Model.Config tag = GetConfig("期刊抓取", "文章列表容器tag");
            VIP.Model.Config attr = GetConfig("期刊抓取", "文章列表容器识别参数及值");
            VIP.Model.Config wtag = GetConfig("期刊抓取", "文章容器tag");
            VIP.Model.Config ttag = GetConfig("期刊抓取", "文章标题容器tag");
            VIP.Model.Config tattr = GetConfig("期刊抓取", "文章标题容器识别参数及值");
            VIP.Model.Config atag = GetConfig("期刊抓取", "文章作者容器tag");
            VIP.Model.Config aattr = GetConfig("期刊抓取", "文章作者容器识别参数及值");
            VIP.Model.Config xpath = GetConfig("期刊抓取", "文章容器XPath");
            VIP.Model.Config txpath = GetConfig("期刊抓取", "文章标题容器XPath");
            VIP.Model.Config axpath = GetConfig("期刊抓取", "文章作者容器XPath");
            //VIP.Model.Config qattr = GetConfig("期刊抓取", "期刊期次容器识别参数及值");
            if (axpath != null && !String.IsNullOrEmpty(axpath.ConfigValue) && txpath != null && !String.IsNullOrEmpty(txpath.ConfigValue) && xpath != null && !String.IsNullOrEmpty(xpath.ConfigValue) && tag != null && !String.IsNullOrEmpty(tag.ConfigValue) && attr != null && !String.IsNullOrEmpty(attr.ConfigValue) && wtag != null && !String.IsNullOrEmpty(wtag.ConfigValue) && ttag != null && !String.IsNullOrEmpty(ttag.ConfigValue) && tattr != null && !String.IsNullOrEmpty(tattr.ConfigValue) && atag != null && !String.IsNullOrEmpty(atag.ConfigValue) && aattr != null && !String.IsNullOrEmpty(aattr.ConfigValue))
            {
                string[] attri = attr.ConfigValue.Replace("class", "className").Split(new char[] { ';', ';' }, StringSplitOptions.RemoveEmptyEntries);
                string[] tattri = tattr.ConfigValue.Replace("class", "className").Split(new char[] { ';', ';' }, StringSplitOptions.RemoveEmptyEntries);
                string[] aattri = aattr.ConfigValue.Replace("class","className").Split(new char[] { ';', ';' }, StringSplitOptions.RemoveEmptyEntries);
               
                    IHTMLElementCollection hec = ((IHTMLElement2)document.body).getElementsByTagName(tag.ConfigValue);
                    VIP.BLL.Phase p = new VIP.BLL.Phase();
                    VIP.Model.Phase phase = p.GetModel(Convert.ToInt32(phaseid));
                    if (phase == null)
                        return;

                    foreach (IHTMLElement he in hec)
                    {
                        bool target = true;
                        foreach (string at in attri)
                        {
                            string[] a = at.Split(new char[] { ':', ':' }, StringSplitOptions.RemoveEmptyEntries);
                            if (a.Length == 1)
                            {
                                target = target && he.innerText.Contains(a[0]);
                            }
                            else if (a.Length == 2)
                            {
                                object value = he.getAttribute(a[0]);
                                if (value == null)
                                    value = "";
                                target = target && value.ToString() == a[1];
                            }
                        }
                        if (target)
                        {

                            IHTMLElementCollection papersList = ((IHTMLElement2)he).getElementsByTagName(wtag.ConfigValue);
                            foreach (IHTMLElement paper in papersList)
                            {
                                string authorstr = "";
                                IHTMLElementCollection authors = ((IHTMLElement2)paper).getElementsByTagName(atag.ConfigValue);
                                foreach (IHTMLElement author in authors)
                                {
                                    bool atarget = true;
                                    foreach (string at in aattri)
                                    {
                                        string[] a = at.Split(new char[] { ':', ':' }, StringSplitOptions.RemoveEmptyEntries);
                                        if (a.Length == 1)
                                        {
                                            atarget = atarget && author.innerText.Contains(a[0]);
                                        }
                                        else if (a.Length == 2)
                                        {
                                            object value = author.getAttribute(a[0]);
                                            if (value == null)
                                                value = "";
                                            atarget = atarget && value.ToString() == a[1];
                                        }
                                    }
                                    if (atarget)
                                    {
                                        authorstr = author.innerText;
                                    }
                                }
                                authors = null;
                                IHTMLElementCollection titles = ((IHTMLElement2)paper).getElementsByTagName(ttag.ConfigValue);
                                foreach (IHTMLElement title in titles)
                                {
                                    bool ttarget = true;
                                    foreach (string at in tattri)
                                    {
                                        string[] a = at.Split(new char[] { ':', ':' }, StringSplitOptions.RemoveEmptyEntries);
                                        if (a.Length == 1)
                                        {
                                            ttarget = ttarget && title.innerText.Contains(a[0]);
                                        }
                                        else if (a.Length == 2)
                                        {
                                            object value = title.getAttribute(a[0]);
                                            if (value == null)
                                                value = "";
                                            ttarget = ttarget && value.ToString() == a[1];
                                        }
                                    }
                                    if (ttarget)
                                    {
                                        string url = title.getAttribute("href").ToString().Replace("about:", "");
                                        url = checkUrl(url);
                                        VIP.Model.Paper mp = null;
                                        foreach (VIP.Model.Paper ppp in papers)
                                        {
                                            if (ppp.Url == url)
                                            {
                                                mp = ppp;
                                                break;
                                            }
                                        }
                                        if (mp == null)
                                            mp = new VIP.Model.Paper();

                                        string titlestr = title.innerText.Replace("'","");
                                        if (mp.Title != titlestr || mp.JounalID != Convert.ToInt32(phaseid) || mp.Author != authorstr || String.IsNullOrEmpty(mp.KeyWords))
                                        {
                                            if (IsUrl(url) && String.IsNullOrEmpty(mp.KeyWords))
                                            {
                                                byte[] bResponse = wc.DownloadData(url);
                                                string strResponse = Encoding.UTF8.GetString(bResponse);
                                                HTMLDocumentClass doc = new HTMLDocumentClass();
                                                doc.designMode = "on";
                                                doc.IHTMLDocument2_write(strResponse);
                                                doc.close();
                                                mp = GetPaperDetailData(doc, mp);
                                                doc = null;
                                            }
                                            mp.Title = titlestr;
                                            mp.Url = url;
                                            mp.JounalID = Convert.ToInt32(phaseid);
                                            mp.Author = authorstr;
                                            if (mp.ID == 0)
                                            {
                                                papers.Add(mp);
                                            }
                                            //mp = pp.Save(mp);
                                           
                                        }
                                        grabcount++;
                                        this.BeginInvoke(new EventHandler(SetCount), grabcount.ToString());
                                    }
                                }
                                titles = null;
                            }
                        }
                    }
                    pp.Save(papers);
                    hec = null;
                }

        }

110,534

社区成员

发帖
与我相关
我的任务
社区描述
.NET技术 C#
社区管理员
  • C#
  • Web++
  • by_封爱
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告

让您成为最强悍的C#开发者

试试用AI创作助手写篇文章吧