C++爬虫崩溃问题

大炸B 2017-10-13 11:57:49
爬虫是爬取图片的
爬虫爬取百度可以爬一百多个网页,但是爬新浪第二个直接崩溃,堆栈调用指向tcpconnect::tcpclose()中的delet[] 或者closesocket()。delete[]还可以理解,可能是服务器返回的HTTP响应被我错误解析导致的越界操作。但是closesocket()崩溃就无法理解了。堆栈显示是在NTDLL中操作冲突了。
整个程序的大致想法是一个线程爬取网页,然后把网页html文档中的img标签和href项提取出来保存到url和img两个矢量中,然后另一个线程下载img矢量中的图片。socket使用的是非阻塞的。
engine.h
#pragma once
#include"tcp.h"
#include<vector>
#include<Windows.h>
#include<process.h>
using namespace std;
class mainclass
{
public:
vector<char*>url;
vector<char*>img;
mainclass(char* starturl);
~mainclass();
void gethtml();
void getimg();
tcpconnect* httpget;
tcpconnect* imgget;
bool threadflag;
bool urlthreadinf;
bool imgthreadinf;
void htmlanalyze(char* doc,int docsize);
void imganalyze(char* doc,int docsize);
protected:
static DWORD WINAPI urlFun(LPVOID a);
static DWORD WINAPI imgFun(LPVOID a);
private:
bool checkurl(char* check);
CRITICAL_SECTION* imgvectorlock;
};

tcp.h
#pragma once
#include"tcp.h"
#include<vector>
#include<Windows.h>
#include<process.h>
using namespace std;
class mainclass
{
public:
vector<char*>url;
vector<char*>img;
mainclass(char* starturl);
~mainclass();
void gethtml();
void getimg();
tcpconnect* httpget;
tcpconnect* imgget;
bool threadflag;
bool urlthreadinf;
bool imgthreadinf;
void htmlanalyze(char* doc,int docsize);
void imganalyze(char* doc,int docsize);
protected:
static DWORD WINAPI urlFun(LPVOID a);
static DWORD WINAPI imgFun(LPVOID a);
private:
bool checkurl(char* check);
CRITICAL_SECTION* imgvectorlock;
};

engine.cpp
#include "engine.h"
#include <fstream>
#include<Windows.h>
using namespace std;
char* getfilename(char*url)
{
int i = 0;
char* lp = url + strlen(url)-1;
bool flag = true;
for (; flag; lp--)
{
if (*lp == '/')
flag = false;
if (*lp == '\\')
flag = false;
if (*lp == '<')
flag = false;
if (*lp == '>')
flag = false;
if (*lp == '\"')
flag = false;
if (*lp == '?')
flag = false;
if (*lp == '*')
flag = false;
i++;
if (i > 40)
flag = false;
}
lp++;
return lp;
}
mainclass::mainclass(char * starturl)
{
url.push_back(starturl);
httpget = new tcpconnect();
imgget = new tcpconnect();
threadflag = true;
urlthreadinf = true;
imgvectorlock = new CRITICAL_SECTION();
InitializeCriticalSection(imgvectorlock);
CreateThread(NULL, 0, mainclass::urlFun, this, 0, NULL);
CreateThread(NULL, 0, mainclass::imgFun, this, 0, NULL);
}

mainclass::~mainclass()
{
delete httpget;
delete imgget;
DeleteCriticalSection(imgvectorlock);
}

void mainclass::gethtml()
{
int i = 0,len,size;
char* tempurl;
while (threadflag)
{
tempurl = url.at(i);
httpget->seturl(tempurl);
if (httpget->myconnect() == false)
{
i++;
httpget->tcpclose();
if (i > url.size())
{
urlthreadinf = false;
return;
}
continue;
}
len = httpget->makehttpgetmsg();
httpget->tcpsend(httpget->getmsg, len);
size = httpget->recvhtml();
if (size == -1)
{
i++;
httpget->tcpclose();
if (i > url.size())
{
urlthreadinf = false;
return;
}
continue;
}
htmlanalyze(httpget->buf,size);
imganalyze(httpget->buf,size);
i++;
cout << "urlcount:" << i << endl;
if (i >= url.size())
{
urlthreadinf = false;
httpget->tcpclose();
return;
}
httpget->tcpclose();
}
}

void mainclass::getimg()
{
while (threadflag)
{
int len, size;
char* tempurl;
while (threadflag)
{
EnterCriticalSection(imgvectorlock);
if(img.empty())
{
LeaveCriticalSection(imgvectorlock);
continue;
}
tempurl = img.at(0);
LeaveCriticalSection(imgvectorlock);
imgget->seturl(tempurl);
if (imgget->myconnect() == false)
{
EnterCriticalSection(imgvectorlock);
img.erase(img.begin());
LeaveCriticalSection(imgvectorlock);
imgget->tcpclose();
continue;
}
len = imgget->makehttpgetmsg();
imgget->tcpsend(imgget->getmsg, len);
size = imgget->recvhtml();
if (size == -1)
{
EnterCriticalSection(imgvectorlock);
img.erase(img.begin());
LeaveCriticalSection(imgvectorlock);
imgget->tcpclose();
continue;
}
if (strstr(imgget->head, "OK"))
{
int filelen = 8 + strlen(tempurl);
char* filename = new char[filelen];
memset(filename, 0, filelen);
strcat(filename, "D:\\img\\");
strcat(filename, getfilename(tempurl));
ofstream out(filename, ios::binary);
out.write(imgget->buf, size);
cout << "outimg:" << filename << endl;
out.close();
delete[]filename;
}
imgget->tcpclose();
EnterCriticalSection(imgvectorlock);
img.erase(img.begin());
LeaveCriticalSection(imgvectorlock);
}
}
}

void mainclass::htmlanalyze(char* doc, int docsize)
{
char* finish,*temp;
int size = 0;
char* start = strstr(doc, "href=\"")+6;
while (start != (char*)6)
{
if (start > (doc + docsize))
return;
finish = strstr(start, "\"");
if (finish == NULL)
return;
if (*start == '/')
{
size = finish - start + strlen(httpget->host) + 1;
temp = new char[size];
memset(temp, 0, size);
strcat(temp, httpget->host);
char* lptemp = temp + strlen(httpget->host);
for (int i = 0; i < finish - start; i++)
*(lptemp + i) = *(start + i);
}
else
{
temp = new char[finish - start + 1];
for (int i = 0; i < finish - start; i++)
*(temp + i) = *(start + i);
*(temp + (finish - start)) = 0;
}
if (!checkurl(temp))
{
url.push_back(temp);
cout <<"geturl:"<< temp << endl;
}
start = strstr(finish, "href=\"")+6;
}
}

void mainclass::imganalyze(char* doc, int docsize)
{
char* finish,*temp,*tempstart,*tempfinish;
int size;
char* start = strstr(doc, "<img") + 4;
while (start !=(char*)4 )
{
if (start > (doc + docsize))
return;
finish = strstr(start, "/>");
if (finish == NULL)
return;
char* lptemp = new char[finish - start + 1];
for (int i = 0; i < finish - start; i++)
*(lptemp + i) = *(start + i);
*(lptemp + (finish - start)) = 0;
tempstart = strstr(lptemp,"src=\"")+5;
if (tempstart == (char*)5)
{
start = strstr(finish, "<img") + 4;
continue;
}
tempfinish = strstr(tempstart, "\"");
if (*tempstart == '/')
{
size = tempfinish - tempstart + strlen(httpget->host) + 1;
temp = new char[size];
memset(temp, 0, size);
strcat(temp, httpget->host);
char* lpbuf= temp + strlen(httpget->host);
for (int i = 0; i < tempfinish - tempstart; i++)
*(lpbuf + i) = *(tempstart + i);
}
else
{
temp = new char[tempfinish - tempstart + 1];
for (int i = 0; i < tempfinish - tempstart; i++)
*(temp + i) = *(tempstart + i);
*(temp + (tempfinish - tempstart)) = 0;
}
delete[]lptemp;
EnterCriticalSection(imgvectorlock);
img.push_back(temp);
cout << "getimg:" << temp << endl;
LeaveCriticalSection(imgvectorlock);
start = strstr(finish, "<img") + 4;
}
}

DWORD mainclass::urlFun(LPVOID a)
{
mainclass* lp = (mainclass*)a;
lp->gethtml();
return 0;
}

DWORD mainclass::imgFun(LPVOID a)
{
mainclass* lp = (mainclass*)a;
lp->getimg();
return 0;
}

bool mainclass::checkurl(char * check)
{
for (int i = 0; i < url.size(); i++)
{
if (!strcmp(check, url.at(i)))
return true;
}
return false;
}

...全文
599 5 打赏 收藏 转发到动态 举报
写回复
用AI写文章
5 条回复
切换为时间正序
请发表友善的回复…
发表回复
大炸B 2017-12-03
  • 打赏
  • 举报
回复
引用 3 楼 bigfog的回复:
不明白 while (1); 这个干吗
因为爬取网页和爬取图片是两个线程。所以主线程必须挂起。不然程序会退出的
大炸B 2017-12-03
  • 打赏
  • 举报
回复
引用 2 楼 踏岸寻柳的回复:
既然都用C++Builder了,就别使用裸socket了,换成高层次的网络组件吧
只是想实际了解下socket和http协议。所以想试试能不能从底层开始写
bigfog 2017-11-20
  • 打赏
  • 举报
回复
不明白 while (1); 这个干吗
踏岸寻柳 2017-11-18
  • 打赏
  • 举报
回复
既然都用C++Builder了,就别使用裸socket了,换成高层次的网络组件吧
大炸B 2017-10-13
  • 打赏
  • 举报
回复
tcp.cpp
#include "tcp.h"
#include<ctime>
int getlen(char* buf)
{
	int i, j, num = 0;
	buf = strstr(buf, "Content-Length: ") + 16;
	if (buf == (char*)16)
		return -1;
	char* temp = strstr(buf, "\r\n");
	i = temp - buf;
	for (j = 0; j <i; j++)
	{
		if ((*(buf + j) - '0') < 0 || (*(buf + j) - '0') > 9)
			return -1;
		num += ((*(buf + j) - '0') * pow(10, (i - j - 1)));
	}
	return num;
}
int tcpconnect::inibuf(char* src, char* cpy, int len)
{
	int count, total,headlen;
	char* temp;
	temp = strstr(cpy, "\r\n\r\n") + 4;
	if (temp == (char*)4 ||temp>(cpy+len))
		return -1;
	headlen = temp - cpy+1;
	head = new char[headlen];
	memset(head, 0, headlen);
	for (count = 0; count < headlen-1; count++)
		*(head+count) = *(cpy+count);
	total = len - (temp - cpy);
	for (count = 0; count < total; count++)
		*(src + count) = *(temp + count);
	return total;
}

tcpconnect::tcpconnect()
{
	WSADATA wsaData;
	WSAStartup(MAKEWORD(2, 2), &wsaData);
	tcpsocket= socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	int iMode = 1;
	ioctlsocket(tcpsocket, FIONBIO,(u_long FAR*)&iMode);
	myaddr.sin_family = AF_INET;
	myaddr.sin_port = htons(80);
}

bool tcpconnect::myconnect()
{
	int i=0,j=0;
	clock_t start, finish;
	struct hostent *hptr = gethostbyname(host);
	if (hptr == NULL)
		return false;
	myaddr.sin_addr.S_un.S_addr = inet_addr(inet_ntoa((*(struct in_addr*)hptr->h_addr)));
	start = clock();
	do {
		connect(tcpsocket, (sockaddr *)&myaddr, sizeof(myaddr));
		j = WSAGetLastError();
		finish = clock();
		if (finish - start > 2000)
			return false;
	} while (j !=10056);
	return true;
}

int tcpconnect::tcpsend(char* sendbuf, int buflen)
{
	int i;
	do {
		i = send(tcpsocket, sendbuf, buflen, 0);
	} while (i != buflen);
	return i;
}

int tcpconnect::tcprecv(char* buf,int buflen)
{
	int i = recv(tcpsocket, buf, buflen, 0);
	if (i <0)
		return 0;
	else
	    return i;
}

void tcpconnect::tcpclose()
{
	closesocket(tcpsocket);
	tcpsocket = INVALID_SOCKET;
	tcpsocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
	int iMode = 1;
	ioctlsocket(tcpsocket, FIONBIO, (u_long FAR*)&iMode);
	delete[]host,host=NULL;
	delete[]head,head=NULL;
	delete[]getmsg,getmsg=NULL;
	delete[]resource,resource=NULL;
	delete[]buf,buf=NULL;
}

int tcpconnect::makehttpgetmsg()
{
	int msglen= 25 + strlen(host) + strlen(resource) + 1;
	getmsg = new char[msglen];
	memset(getmsg, 0, msglen);
	strcat(getmsg, "GET ");
	strcat(getmsg, resource);
	strcat(getmsg, " HTTP/1.1\r\nHost: ");
	strcat(getmsg, host);
	strcat(getmsg, "\r\n\r\n");
	return msglen;
}

int tcpconnect::recvhtml()
{
	clock_t start, finish;
	int count=0, total, j;
	char*tempbuf;
	tempbuf = new char[1001];
	memset(tempbuf, 0, 1001);
	start = clock();
	do
	{
		j = tcprecv(tempbuf+count, 1000-count);
		count += j;
		finish = clock();
		if (finish - start > 2000)
		{
			break;
		}
	} while (count <1000);
	total = getlen(tempbuf);
	if (total == -1)
		return -1;
	buf = new char[total];
	memset(buf, 0, total);
	count = inibuf(buf, tempbuf, count);
	delete[]tempbuf;
	if (count > total || count < 0)
		return -1;
	start = clock();
	j = 0;
	do
	{
		j =tcprecv(buf + count, total - count);
		count += j;
		if(j!=0)
			start = clock();
		finish = clock();
		if (finish - start > 10000)
			return -1;
	}while (count < total);
	bufsize = total;
	return total;
}

tcpconnect::~tcpconnect()
{
	closesocket(tcpsocket);
	WSACleanup();
}

void tcpconnect::seturl(char * temp)
{
	if (strstr(temp, "http://") != 0)
		temp = temp + 7;
	if (strstr(temp, "https://") != 0)
		temp = temp + 8;
	char* srcpos = strstr(temp, "/");
	if (srcpos == NULL)
	{
		resource = new char[2];
		memset(resource,0,2);
		strcpy(resource, "/");
		host = new char[strlen(temp) + 1];
		memset(host, 0, strlen(temp) + 1);
		strcpy(host, temp);
	}
	else
	{
		resource = new char[strlen(srcpos)+1];
		memset(resource, 0, strlen(srcpos) + 1);
		strcpy(resource, srcpos);
		host = new char[srcpos-temp+ 1];
		memset(host, 0, srcpos - temp + 1);
		for (int i = 0; i < srcpos - temp; i++)
			host[i] = temp[i];
	}
}
testmain.cpp
#include"engine.h"
extern char* getfilename(char* url);
void main()
{
	mainclass* engine = new mainclass("www.baidu.com");
	while (1);
    
}

1,316

社区成员

发帖
与我相关
我的任务
社区描述
C++ Builder 网络及通讯开发
社区管理员
  • 网络及通讯开发社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧