如何提取网页中的所有链接？

babam 2004-06-30 05:46:01

有很多链接都是由脚本生成的。这种情况有现成的解决方案吗？
还要用多线程。开始愁了。
谢谢了！

...全文

178 15 打赏收藏转发到动态举报

写回复

用AI写文章

15 条回复

切换为时间正序

请发表友善的回复…

发表回复

babam 2004-07-02

打赏
举报

大哥，你把所有主机地址后面的字符都删除了，可以不删除吗？
删除的话，有很多链接都取得不对啊！

babam 2004-07-02

打赏
举报

查了一下，大哥是牛人谢谢了

babam 2004-07-02

打赏
举报

相对链接是什么？我是菜鸟

yst_killed 2004-07-01

打赏
举报

下载的时候你可以用多线程执行 GetHtml() 这个函数
当然了，你需要一个不停的输入URL的程序，这个估计你自己能写吧！

以上程序取到的 URL 都存在一个文本文件 url.txt 里了

yst_killed 2004-07-01

打赏
举报

靠，贴出来怎么这样，你调整一下吧

对了，你还需要
在 StdAfx.h 里添加

#pragma warning(disable : 4192)
#pragma warning(disable : 4146)
#import <mshtml.tlb> // Internet Explorer 5
#import <shdocvw.dll>

yst_killed 2004-07-01

打赏
举报

这个东西我做了一个
最好的办法就是下载下这个网页
打开它，查找里面的 href 这个属性标志
取出全部链接~

/// Downloads the resource at UrlStr and saves it, line by line, to the text
/// file SavePath + UrlName.
///
/// @param SavePath  Directory prefix of the output file; it is concatenated
///                  with UrlName as-is, so it must already end with a path
///                  separator.
/// @param UrlStr    URL handed to CInternetSession::OpenURL.
/// @param UrlName   File name of the local copy.
/// @return TRUE when the whole resource was read and written; FALSE when the
///         URL could not be opened, the output file could not be created, or
///         an MFC exception was raised while copying.
BOOL CDghttpDlg::GetHtml(CString SavePath, CString UrlStr, CString UrlName)
{
    CInternetSession session;

    // OpenURL returns a CStdioFile* (a CInternetFile-derived object for
    // http/ftp URLs, a plain CStdioFile for file:// URLs). ReadString is
    // virtual, so the base pointer is sufficient and no downcast is needed.
    CStdioFile* remote = NULL;
    try
    {
        remote = session.OpenURL(UrlStr);
    }
    catch (CInternetException* openError)
    {
        openError->Delete();
        return FALSE;
    }

    if (remote == NULL)
    {
        // There is no output file to write the message to at this point, so
        // report the failure through the debug trace instead.
        TRACE(_T("到指定服务器的连接建立失败...\n"));
        return FALSE;
    }

    BOOL saved = FALSE;
    try
    {
        // dataStore holds the local copy of the page. It lives inside the try
        // block so that it is closed by its destructor if copying throws.
        CStdioFile dataStore;
        const UINT openFlags = CFile::modeCreate | CFile::modeWrite |
                               CFile::shareDenyWrite | CFile::typeText;
        if (dataStore.Open(SavePath + UrlName, openFlags))
        {
            // ReadString strips the line terminator, so one is written back
            // after every line.
            CString line;
            while (remote->ReadString(line))
            {
                dataStore.WriteString(line);
                dataStore.WriteString(_T("\n"));
            }
            dataStore.Close();
            saved = TRUE;
        }
    }
    catch (CException* copyError)
    {
        // Read or write failure in the middle of the transfer.
        copyError->Delete();
        saved = FALSE;
    }

    // Release the remote file on every path, including the failure paths
    // above.
    try
    {
        remote->Close();
    }
    catch (CException* closeError)
    {
        closeError->Delete();
    }
    delete remote;

    return saved;
}

取网页的

接下来是取链接的

// Returns true when the bytes starting at text[pos] match the NUL-terminated
// token and the match lies entirely inside [0, length).
static bool DghttpHasTokenAt(const char* text, int length, int pos, const char* token)
{
    for (int k = 0; token[k] != '\0'; ++k)
    {
        if (pos + k >= length || text[pos + k] != token[k])
            return false;
    }
    return true;
}

// Returns true for the characters that end the captured part of a URL.
static bool DghttpIsUrlTerminator(char ch)
{
    return ch == ' ' || ch == '>' || ch == '"' || ch == '/';
}

/// Scans the saved page strPath + FileName for href attributes that point at
/// absolute http:// URLs and appends one line per hit to strPath + "url.txt".
///
/// Matching rules:
///  - the literal, case-sensitive text "href" is searched for; the character
///    right after it is skipped without being checked (normally '=');
///  - the value may optionally start with a double quote;
///  - the value must begin with "http://", anything else is ignored;
///  - capture stops at the first space, '>', '"' or '/', so only the
///    "http://host" part of each link is stored, followed by "\r\n".
///
/// The page is treated as single-byte text and the captured bytes are written
/// to url.txt unchanged.
///
/// @param strPath   Directory prefix used for both the input file and
///                  url.txt; it is concatenated as-is, so it must already end
///                  with a path separator.
/// @param FileName  Name of the saved page inside strPath.
void CDghttpDlg::GetAllUrl(CString strPath, CString FileName)
{
    CFile pageFile;
    CFileException openError;
    // The page is only read, so write access is not requested.
    if (!pageFile.Open(strPath + FileName, CFile::modeRead, &openError))
    {
        TRACE(_T("File could not be opened, cause=%d\n"), openError.m_cause);
        return;
    }

    try
    {
        // Load the whole page so that a link can never be split across the
        // edge of a read buffer and every index below can be bounds-checked
        // against the number of bytes actually read.
        CByteArray page;
        const DWORD pageSize = static_cast<DWORD>(pageFile.GetLength());
        page.SetSize(static_cast<int>(pageSize));
        UINT bytesRead = 0;
        if (pageSize > 0)
            bytesRead = pageFile.Read(page.GetData(), pageSize);
        pageFile.Close();

        const char* text = reinterpret_cast<const char*>(page.GetData());
        const int length = static_cast<int>(bytesRead);

        static const char kHref[] = "href";
        static const char kScheme[] = "http://";
        const int kHrefLen = 4;
        const int kSchemeLen = 7;

        // url.txt is opened lazily, on the first hit, so a page without links
        // does not create or touch it.
        CFile urlFile;
        bool urlFileOpen = false;

        for (int i = 0; i + kHrefLen <= length; ++i)
        {
            if (!DghttpHasTokenAt(text, length, i, kHref))
                continue;

            // Skip "href" plus the following character, then an optional
            // opening double quote.
            int valueStart = i + kHrefLen + 1;
            if (valueStart < length && text[valueStart] == '"')
                ++valueStart;

            if (!DghttpHasTokenAt(text, length, valueStart, kScheme))
                continue;

            const int hostBegin = valueStart + kSchemeLen;
            int hostEnd = hostBegin;
            while (hostEnd < length && !DghttpIsUrlTerminator(text[hostEnd]))
                ++hostEnd;

            if (!urlFileOpen)
            {
                CFileException urlOpenError;
                // modeCreate + modeNoTruncate: create url.txt when it is
                // missing, keep its current contents when it exists.
                const UINT urlFlags = CFile::modeCreate | CFile::modeNoTruncate |
                                      CFile::modeWrite;
                if (!urlFile.Open(strPath + _T("url.txt"), urlFlags, &urlOpenError))
                {
                    TRACE(_T("File could not be opened, cause=%d\n"), urlOpenError.m_cause);
                    return;
                }
                urlFile.SeekToEnd();
                urlFileOpen = true;
            }

            urlFile.Write(kScheme, kSchemeLen);
            if (hostEnd > hostBegin)
                urlFile.Write(text + hostBegin, hostEnd - hostBegin);
            urlFile.Write("\r\n", 2);

            // Resume scanning right after the terminator.
            i = hostEnd;
        }

        if (urlFileOpen)
            urlFile.Close();
    }
    catch (CException* ioError)
    {
        // Read, write or allocation failure; files still open are closed by
        // their destructors.
        TRACE(_T("GetAllUrl: I/O error while extracting links\n"));
        ioError->Delete();
    }
}