在多线程中实现抓取网页时出错了,问题似乎出在对公共变量的操作上,但是调试了好几天都没有解决,希望哪位高手帮忙看一下。本人是新手,程序写得可能有些不规范,请高手见谅。
DWORD WINAPI ThreadProc(LPVOID pParam)
{
int index=*(int*)pParam;
delete pParam;
_ConnectionPtr m_Pcon;
_RecordsetPtr m_Precord;
FILE* fp;
CInternetSession mySession(NULL,0);
CHttpFile* myHttpFile=NULL;
CString straddress,strsource,strline,filepath,fileaddress,strsql;
int num;
CMainFrame *pw=(CMainFrame *)AfxGetMainWnd();
CDialogBar *pb=&(pw->m_DlgToolBar);
pb->GetDlgItemText(IDC_EDADD,fileaddress);
filepath.Format("%s\\test.htm",fileaddress);
boost::regex expression("((http://|bbs\\.|www\\.|blog\\.)([^/]+)?(?:\\.com|\\.org|\\.edu|\\.gov|\\.biz|\\.net|\\.int|\\.info|\\.ous|\\.[a-z][a-z])(:(\\d+))?(/([^/]+)?)*/([^/]+)?(\\.htm|\\.html))|((http://|bbs\\.|www\\.|blog\\.)([^/]+)?(?:\\.com|\\.org|\\.edu|\\.gov|\\.biz|\\.net|\\.int|\\.info|\\.ous|\\.[a-z][a-z])(:(\\d+))?)");
CString sRet;
boost::cmatch what;
m_Pcon.CreateInstance("ADODB.Connection");
m_Precord.CreateInstance("ADODB.recordset");
m_Pcon->ConnectionString = "Provider=Microsoft.Jet.OLEDB.4.0; \
Data Source=./testlist.mdb;Persist Security Info=False";
try
{
m_Pcon->Open("","","",-1);
}
catch(...)
{
AfxMessageBox("数据库连接错误",0,0);
}
// while(m_calculater<=list_calculater)
// {
while(m_critical==true)
{
// if(m_critical==true) SuspendThread(m_pHthread[index]);
}
m_critical==true; //锁住
strsql.Format("select * from list1 where NUM=%d",m_calculater); //天呐,原来要这样写!
m_Precord->Open((_variant_t)strsql,m_Pcon.GetInterfacePtr(),adOpenKeyset,adLockOptimistic,adCmdText);
//读取数据表中URL
straddress=(char*)(_bstr_t)m_Precord->GetCollect("URL");
m_calculater++;
myHttpFile=(CHttpFile*)mySession.OpenURL(straddress);
while(myHttpFile->ReadString(strline))
{
strsource+=strline;
strsource+="\r\n";
if(boost::regex_search(LPCSTR(strline), what, expression)) //CString转string
{
sRet = (what.str()).c_str();
// AfxMessageBox(sRet,0,0);
try
{
m_Precord->Open("select * from list1 where [URL]='"+(_bstr_t)sRet+"'",m_Pcon.GetInterfacePtr(),adOpenKeyset,adLockOptimistic,adCmdText);
num=m_Precord->GetRecordCount();
//避免插入重复的URL地址
if(num<1)
m_Pcon->Execute("insert into list1 values('"+ (_bstr_t)sRet +"','"+ (_bstr_t)list_calculater +"')",NULL,adCmdText);
list_calculater++;
}
catch(...)
{
continue;
return 0;
}
}
}
fp=fopen(filepath,"a");
fprintf(fp,"%s",strsource);
fclose(fp);
m_critical==false; //解锁
// }
myHttpFile->Close;
mySession.Close;
return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void CSpiderForGView::OnGodownload()
{
_ConnectionPtr m_Pcon;
_RecordsetPtr m_Precord;
FILE* fp;
CString straddress,fileaddress;
int num;
//正则表达所有HTM或HTML或无后缀的URL(网上找到的正则表达式都有问题,只能自己写了个这么烂的,莫怪。。)
boost::regex expression("((http://|bbs\\.|www\\.|blog\\.)([^/]+)?(?:\\.com|\\.org|\\.edu|\\.gov|\\.biz|\\.net|\\.int|\\.info|\\.ous|\\.[a-z][a-z])(:(\\d+))?(/([^/]+)?)*/([^/]+)?(\\.htm|\\.html))|((http://|bbs\\.|www\\.|blog\\.)([^/]+)?(?:\\.com|\\.org|\\.edu|\\.gov|\\.biz|\\.net|\\.int|\\.info|\\.ous|\\.[a-z][a-z])(:(\\d+))?)");
CString sRet;
boost::cmatch what;
CMainFrame *pw=(CMainFrame *)AfxGetMainWnd();
CDialogBar *pb=&(pw->m_DlgToolBar);
pb->GetDlgItemText(IDC_EDADD,fileaddress);
pb->GetDlgItemText(IDC_EDADDRESS,straddress);
pb->GetDlgItem(IDC_GODOWNLOAD)->EnableWindow(FALSE);
CInternetSession mySession(NULL,0);
CHttpFile* myHttpFile=NULL;
CString strsource,strline;
myHttpFile=(CHttpFile*)mySession.OpenURL(straddress);
while(myHttpFile->ReadString(strline))
{
strsource+=strline;
strsource+="\r\n";
if(boost::regex_search(LPCSTR(strline), what, expression)) //CString转string
{
sRet = (what.str()).c_str();
//向字符串数组中添加成员(与将下标加一再进行赋值效果相同)
//Addstore.Add(sRet);
//MessageBox(sRet);
//使用ADO连接Access数据源
m_Pcon.CreateInstance("ADODB.Connection");
m_Precord.CreateInstance("ADODB.recordset");
m_Pcon->ConnectionString = "Provider=Microsoft.Jet.OLEDB.4.0; \
Data Source=./testlist.mdb;Persist Security Info=False";
try
{
m_Pcon->Open("","","",-1);
}
catch(...)
{
AfxMessageBox("数据库连接错误",0,0);
}
try
{
m_Precord->Open("select * from list1 where URL='"+(_bstr_t)sRet+"'",m_Pcon.GetInterfacePtr(),adOpenKeyset,adLockOptimistic,adCmdText);
num=m_Precord->GetRecordCount();
//避免插入重复的URL地址
if(num<1)
//两种方式插入数据记录,任选其一:
m_Pcon->Execute("insert into list1 values('"+ (_bstr_t)sRet +"','"+ (_bstr_t)list_calculater +"')",NULL,adCmdText);
list_calculater++;
/************************************************************
m_Precord->AddNew();//添加新行
m_Precord->GetFields()->GetItem("URL")->Value = (_bstr_t)sRet;
m_Precord->GetFields()->GetItem("URL")->Value = list_calculater;
m_Precord->Update();
*************************************************************/
}
catch(...)
{
/********************************************************************************
关于此处使用continue关键字的解释说明以及提问:
1.若不使用try机制,编译无错误,运行时却出现"Runtime Error";将sRet变量换成具体的字符串就能运行成功;
估计是内存溢出,有待解答。
2.之前在continue位置上使用了break关键字,程序能够“坚持”的时间较之前久一点,不过还是出现运行中断。
3.使用continue的解释是若出现错误便*忽略*或者*重试*;而是事实上这样使得程序能够正常运行,
但理论上仍然需要进一步完善。
***********************************************************************************/
continue;
MessageBox("操作失败");
return;
}
}
}
//转换文件存储路径
filepath.Format("%s\\test.htm",fileaddress);
pb->SetDlgItemText(IDC_EDADD,filepath);
fp=fopen(filepath,"a");
fprintf(fp,"%s",strsource);
fclose(fp);
m_calculater++;
myHttpFile->Close ;
mySession.Close ;
//多线程
for(int i=0;i<m_ThreadCount;i++)
{
int* temp= new int;
*temp=i;
m_pHthread[i]=CreateThread(NULL,0,ThreadProc,temp,0,NULL);
}
}