1,316
社区成员
发帖
与我相关
我的任务
分享
#pragma once
#include"tcp.h"
#include<vector>
#include<Windows.h>
#include<process.h>
using namespace std;
// Two-thread web crawler.
//
// The constructor starts one worker that walks `url`, downloading each page
// and harvesting links and image addresses from it (gethtml), and a second
// worker that downloads the queued images (getimg).
class mainclass
{
public:
    // Pages to visit, in discovery order. Written by the constructor and by
    // htmlanalyze(); read by gethtml() and checkurl().
    vector<char*> url;
    // Image addresses waiting to be downloaded. Filled by imganalyze() and
    // drained by getimg(); both take imgvectorlock around the access.
    vector<char*> img;

    // explicit: constructing a crawler starts threads, so it must never
    // happen through an implicit char* -> mainclass conversion.
    explicit mainclass(char* starturl);
    ~mainclass();

    // Body of the page-fetching worker.
    void gethtml();
    // Body of the image-fetching worker.
    void getimg();

    tcpconnect* httpget;   // connection used by gethtml()
    tcpconnect* imgget;    // connection used by getimg()

    bool threadflag;       // loop condition of both workers
    bool urlthreadinf;     // set true by the constructor, false when gethtml() finishes
    bool imgthreadinf;     // NOTE(review): presumably the image-worker counterpart; confirm usage

    // Collect href="..." targets from doc[0, docsize) into `url`.
    void htmlanalyze(char* doc, int docsize);
    // Collect <img ... src="..." /> targets from doc[0, docsize) into `img`.
    void imganalyze(char* doc, int docsize);

protected:
    // Thread entry points; the argument is the owning mainclass*.
    static DWORD WINAPI urlFun(LPVOID a);
    static DWORD WINAPI imgFun(LPVOID a);

private:
    // Not copyable: the object owns raw pointers that the destructor deletes,
    // so a copy would free them twice. Declared but intentionally not defined.
    mainclass(const mainclass&);
    mainclass& operator=(const mainclass&);

    // True when `check` is already present in `url`.
    bool checkurl(char* check);
    CRITICAL_SECTION* imgvectorlock;   // guards `img`
};
#pragma once
#include"tcp.h"
#include<vector>
#include<Windows.h>
#include<process.h>
using namespace std;
// Two-thread web crawler.
//
// The constructor starts one worker that walks `url`, downloading each page
// and harvesting links and image addresses from it (gethtml), and a second
// worker that downloads the queued images (getimg).
class mainclass
{
public:
    // Pages to visit, in discovery order. Written by the constructor and by
    // htmlanalyze(); read by gethtml() and checkurl().
    vector<char*> url;
    // Image addresses waiting to be downloaded. Filled by imganalyze() and
    // drained by getimg(); both take imgvectorlock around the access.
    vector<char*> img;

    // explicit: constructing a crawler starts threads, so it must never
    // happen through an implicit char* -> mainclass conversion.
    explicit mainclass(char* starturl);
    ~mainclass();

    // Body of the page-fetching worker.
    void gethtml();
    // Body of the image-fetching worker.
    void getimg();

    tcpconnect* httpget;   // connection used by gethtml()
    tcpconnect* imgget;    // connection used by getimg()

    bool threadflag;       // loop condition of both workers
    bool urlthreadinf;     // set true by the constructor, false when gethtml() finishes
    bool imgthreadinf;     // NOTE(review): presumably the image-worker counterpart; confirm usage

    // Collect href="..." targets from doc[0, docsize) into `url`.
    void htmlanalyze(char* doc, int docsize);
    // Collect <img ... src="..." /> targets from doc[0, docsize) into `img`.
    void imganalyze(char* doc, int docsize);

protected:
    // Thread entry points; the argument is the owning mainclass*.
    static DWORD WINAPI urlFun(LPVOID a);
    static DWORD WINAPI imgFun(LPVOID a);

private:
    // Not copyable: the object owns raw pointers that the destructor deletes,
    // so a copy would free them twice. Declared but intentionally not defined.
    mainclass(const mainclass&);
    mainclass& operator=(const mainclass&);

    // True when `check` is already present in `url`.
    bool checkurl(char* check);
    CRITICAL_SECTION* imgvectorlock;   // guards `img`
};
#include "engine.h"
#include <fstream>
#include<Windows.h>
using namespace std;
// Returns a pointer to the file-name part at the tail of `url`.
//
// Walks backwards from the end of the string until it meets a character that
// cannot appear in a Windows file name (/ \ < > " ? *), reaches the start of
// the string, or has collected 41 characters. The result points just past the
// separator, so it never contains one.
//
// url:     NUL-terminated string; must not be NULL.
// returns: pointer into `url` (nothing is allocated); an empty string when
//          `url` is empty or ends with a separator.
char* getfilename(char*url)
{
    static const char kSeparators[] = "/\\<>\"?*";
    const size_t kMaxNameLen = 41;   // same upper bound as the original backward scan

    char* const end = url + strlen(url);
    char* name = end;
    // `name > url` keeps the scan inside the buffer for short or empty input.
    while (name > url
        && static_cast<size_t>(end - name) < kMaxNameLen
        && strchr(kSeparators, name[-1]) == NULL)
    {
        --name;
    }
    return name;
}
// Builds the crawler and immediately starts its two worker threads.
//
// starturl: first page to visit. The pointer itself is stored in `url` (no
//           copy is made), so the string must outlive the crawler.
mainclass::mainclass(char * starturl)
{
    url.push_back(starturl);
    httpget = new tcpconnect();
    imgget = new tcpconnect();

    threadflag = true;
    urlthreadinf = true;
    // Was left uninitialized; set it the same way as urlthreadinf so that
    // reading it is well-defined.
    imgthreadinf = true;

    imgvectorlock = new CRITICAL_SECTION();
    InitializeCriticalSection(imgvectorlock);

    // Every member the workers touch is initialized above, before either
    // thread can run. The handles are not needed afterwards; closing them
    // does not stop the threads, it only avoids leaking the handles.
    HANDLE urlThread = CreateThread(NULL, 0, mainclass::urlFun, this, 0, NULL);
    if (urlThread != NULL)
        CloseHandle(urlThread);
    else
        urlthreadinf = false;

    HANDLE imgThread = CreateThread(NULL, 0, mainclass::imgFun, this, 0, NULL);
    if (imgThread != NULL)
        CloseHandle(imgThread);
    else
        imgthreadinf = false;
}
// Releases the connections and the lock owned by the crawler.
//
// NOTE(review): the worker threads are only asked to stop (threadflag); they
// are not joined, so a worker that is still inside a request may touch the
// objects freed below. Callers should destroy the crawler only once the
// workers are idle.
mainclass::~mainclass()
{
    threadflag = false;   // both worker loops test this flag
    delete httpget;
    delete imgget;
    DeleteCriticalSection(imgvectorlock);
    delete imgvectorlock; // allocated with new in the constructor
}
// Page worker: visits every entry of `url` in order, downloading the page
// and feeding it to htmlanalyze() / imganalyze(). htmlanalyze() may append
// new entries to `url` while this loop runs, so the size is re-read on every
// iteration. Stops when the list is exhausted or threadflag is cleared, and
// clears urlthreadinf on the way out.
void mainclass::gethtml()
{
    size_t index = 0;
    // `index < url.size()` replaces the original `i > url.size()` checks,
    // which let url.at(i) throw when the last URL failed to connect.
    while (threadflag && index < url.size())
    {
        char* current = url.at(index);
        ++index;

        httpget->seturl(current);
        if (httpget->myconnect())
        {
            const int requestLen = httpget->makehttpgetmsg();
            httpget->tcpsend(httpget->getmsg, requestLen);
            const int size = httpget->recvhtml();
            if (size != -1)
            {
                htmlanalyze(httpget->buf, size);
                imganalyze(httpget->buf, size);
                cout << "urlcount:" << index << endl;
            }
        }
        // Always reset the connection, whether the request succeeded or not.
        httpget->tcpclose();
    }
    urlthreadinf = false;
}
void mainclass::getimg()
{
while (threadflag)
{
int len, size;
char* tempurl;
while (threadflag)
{
EnterCriticalSection(imgvectorlock);
if(img.empty())
{
LeaveCriticalSection(imgvectorlock);
continue;
}
tempurl = img.at(0);
LeaveCriticalSection(imgvectorlock);
imgget->seturl(tempurl);
if (imgget->myconnect() == false)
{
EnterCriticalSection(imgvectorlock);
img.erase(img.begin());
LeaveCriticalSection(imgvectorlock);
imgget->tcpclose();
continue;
}
len = imgget->makehttpgetmsg();
imgget->tcpsend(imgget->getmsg, len);
size = imgget->recvhtml();
if (size == -1)
{
EnterCriticalSection(imgvectorlock);
img.erase(img.begin());
LeaveCriticalSection(imgvectorlock);
imgget->tcpclose();
continue;
}
if (strstr(imgget->head, "OK"))
{
int filelen = 8 + strlen(tempurl);
char* filename = new char[filelen];
memset(filename, 0, filelen);
strcat(filename, "D:\\img\\");
strcat(filename, getfilename(tempurl));
ofstream out(filename, ios::binary);
out.write(imgget->buf, size);
cout << "outimg:" << filename << endl;
out.close();
delete[]filename;
}
imgget->tcpclose();
EnterCriticalSection(imgvectorlock);
img.erase(img.begin());
LeaveCriticalSection(imgvectorlock);
}
}
}
// Bounded substring search used by htmlanalyze(): first occurrence of the
// NUL-terminated `needle` inside [begin, end), or NULL when there is none.
static char* htmlfind(char* begin, char* end, const char* needle)
{
    const size_t needleLen = strlen(needle);
    if (begin == NULL || end == NULL || begin > end
        || static_cast<size_t>(end - begin) < needleLen)
        return NULL;
    char* const last = end - needleLen;
    for (char* p = begin; p <= last; ++p)
    {
        if (memcmp(p, needle, needleLen) == 0)
            return p;
    }
    return NULL;
}

// Extracts every href="..." target from doc[0, docsize) and appends the ones
// not yet known (per checkurl) to `url`, printing each new entry.
//
// Targets that start with '/' are prefixed with httpget->host; all others
// are stored verbatim. Stored strings are allocated with new[].
//
// doc:     page bytes; need not be NUL-terminated, searches stay in bounds.
// docsize: number of valid bytes in `doc`.
void mainclass::htmlanalyze(char* doc, int docsize)
{
    if (doc == NULL || docsize <= 0)
        return;

    static const char kAttr[] = "href=\"";
    const size_t attrLen = sizeof(kAttr) - 1;
    char* const docEnd = doc + docsize;

    char* start = htmlfind(doc, docEnd, kAttr);
    while (start != NULL)
    {
        start += attrLen;   // first character of the attribute value
        char* finish = htmlfind(start, docEnd, "\"");
        if (finish == NULL)
            return;         // unterminated attribute: nothing more to parse

        const size_t linkLen = static_cast<size_t>(finish - start);
        const bool relative = (linkLen > 0 && *start == '/');
        const size_t hostLen = relative ? strlen(httpget->host) : 0;

        char* link = new char[hostLen + linkLen + 1];
        if (relative)
            memcpy(link, httpget->host, hostLen);
        memcpy(link + hostLen, start, linkLen);
        link[hostLen + linkLen] = 0;

        if (checkurl(link))
        {
            delete[] link;  // already queued; the copy used to be leaked
        }
        else
        {
            url.push_back(link);
            cout <<"geturl:"<< link << endl;
        }
        start = htmlfind(finish, docEnd, kAttr);
    }
}
// Bounded substring search used by imganalyze(): first occurrence of the
// NUL-terminated `needle` inside [begin, end), or NULL when there is none.
static char* imgfind(char* begin, char* end, const char* needle)
{
    const size_t needleLen = strlen(needle);
    if (begin == NULL || end == NULL || begin > end
        || static_cast<size_t>(end - begin) < needleLen)
        return NULL;
    char* const last = end - needleLen;
    for (char* p = begin; p <= last; ++p)
    {
        if (memcmp(p, needle, needleLen) == 0)
            return p;
    }
    return NULL;
}

// Extracts the src="..." value of every `<img ... />` tag in doc[0, docsize)
// and appends it to `img` under imgvectorlock, printing each entry.
//
// Values that start with '/' are prefixed with httpget->host; all others are
// stored verbatim. Tags without a complete src="..." attribute are skipped.
// Parsing stops at the first `<img` that has no closing "/>".
// Stored strings are allocated with new[].
//
// doc:     page bytes; need not be NUL-terminated, searches stay in bounds.
// docsize: number of valid bytes in `doc`.
void mainclass::imganalyze(char* doc, int docsize)
{
    if (doc == NULL || docsize <= 0)
        return;

    char* const docEnd = doc + docsize;
    char* tag = imgfind(doc, docEnd, "<img");
    while (tag != NULL)
    {
        char* const attrs = tag + 4;                      // just past "<img"
        char* const tagEnd = imgfind(attrs, docEnd, "/>");
        if (tagEnd == NULL)
            return;

        // Look for the attribute inside this tag only; searching in place
        // makes the temporary copy of the tag text unnecessary.
        char* src = imgfind(attrs, tagEnd, "src=\"");
        char* srcEnd = NULL;
        if (src != NULL)
        {
            src += 5;                                     // just past src="
            srcEnd = imgfind(src, tagEnd, "\"");
        }

        if (srcEnd != NULL)
        {
            const size_t srcLen = static_cast<size_t>(srcEnd - src);
            const bool relative = (srcLen > 0 && *src == '/');
            const size_t hostLen = relative ? strlen(httpget->host) : 0;

            char* imgurl = new char[hostLen + srcLen + 1];
            if (relative)
                memcpy(imgurl, httpget->host, hostLen);
            memcpy(imgurl + hostLen, src, srcLen);
            imgurl[hostLen + srcLen] = 0;

            EnterCriticalSection(imgvectorlock);
            img.push_back(imgurl);
            cout << "getimg:" << imgurl << endl;
            LeaveCriticalSection(imgvectorlock);
        }
        tag = imgfind(tagEnd, docEnd, "<img");
    }
}
// Thread entry point for the page worker.
// a: the mainclass instance passed to CreateThread by the constructor.
// Always returns 0 once gethtml() has finished.
DWORD mainclass::urlFun(LPVOID a)
{
    static_cast<mainclass*>(a)->gethtml();
    return 0;
}
// Thread entry point for the image worker.
// a: the mainclass instance passed to CreateThread by the constructor.
// Always returns 0 once getimg() has finished.
DWORD mainclass::imgFun(LPVOID a)
{
    static_cast<mainclass*>(a)->getimg();
    return 0;
}
bool mainclass::checkurl(char * check)
{
for (int i = 0; i < url.size(); i++)
{
if (!strcmp(check, url.at(i)))
return true;
}
return false;
}
#include "tcp.h"
#include<ctime>
// Parses the value of the "Content-Length: " header found in `buf`.
//
// buf:     NUL-terminated response text.
// returns: the decimal value, or -1 when the header is absent, its line is
//          not terminated by CRLF, the value is empty or contains a
//          non-digit, or the value does not fit in an int.
int getlen(char* buf)
{
    static const char kHeader[] = "Content-Length: ";
    if (buf == NULL)
        return -1;

    // Test the search result before doing arithmetic on it.
    const char* value = strstr(buf, kHeader);
    if (value == NULL)
        return -1;
    value += sizeof(kHeader) - 1;

    const char* lineEnd = strstr(value, "\r\n");
    if (lineEnd == NULL || lineEnd == value)
        return -1;

    const int kMaxInt = static_cast<int>(~0u >> 1);
    int num = 0;
    // Pure integer accumulation: no floating-point pow(), no rounding.
    for (const char* p = value; p != lineEnd; ++p)
    {
        if (*p < '0' || *p > '9')
            return -1;
        const int digit = *p - '0';
        if (num > (kMaxInt - digit) / 10)
            return -1;   // would overflow int
        num = num * 10 + digit;
    }
    return num;
}
// Splits the first chunk of a response into header and body.
//
// The header (up to and including the blank line) is copied into a freshly
// allocated, NUL-terminated `head`; the bytes that follow it are copied to
// `src`.
//
// src: destination for the body bytes; the caller must provide room for
//      every body byte present in `cpy`.
// cpy: NUL-terminated buffer holding the received bytes.
// len: number of received bytes in `cpy`.
// returns: number of body bytes copied to `src`, or -1 when the blank line
//          is missing or lies beyond `len`.
int tcpconnect::inibuf(char* src, char* cpy, int len)
{
    // Check the search result itself instead of comparing "NULL + 4".
    char* headerEnd = strstr(cpy, "\r\n\r\n");
    if (headerEnd == NULL)
        return -1;
    char* body = headerEnd + 4;
    if (body > cpy + len)
        return -1;

    const int headBytes = static_cast<int>(body - cpy);
    head = new char[headBytes + 1];
    memcpy(head, cpy, headBytes);
    head[headBytes] = 0;

    const int bodyBytes = len - headBytes;
    memcpy(src, body, bodyBytes);
    return bodyBytes;
}
// Initializes Winsock, creates a non-blocking TCP socket and prepares the
// destination address for port 80. The host address itself is filled in by
// myconnect().
tcpconnect::tcpconnect()
{
    // tcpclose() delete[]s all of these unconditionally, so they must start
    // out as NULL rather than as indeterminate pointers.
    host = NULL;
    head = NULL;
    getmsg = NULL;
    resource = NULL;
    buf = NULL;
    bufsize = 0;

    WSADATA wsaData;
    WSAStartup(MAKEWORD(2, 2), &wsaData);

    tcpsocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    u_long nonBlocking = 1;
    ioctlsocket(tcpsocket, FIONBIO, &nonBlocking);

    memset(&myaddr, 0, sizeof(myaddr));   // no garbage in the unused fields
    myaddr.sin_family = AF_INET;
    myaddr.sin_port = htons(80);
}
bool tcpconnect::myconnect()
{
int i=0,j=0;
clock_t start, finish;
struct hostent *hptr = gethostbyname(host);
if (hptr == NULL)
return false;
myaddr.sin_addr.S_un.S_addr = inet_addr(inet_ntoa((*(struct in_addr*)hptr->h_addr)));
start = clock();
do {
connect(tcpsocket, (sockaddr *)&myaddr, sizeof(myaddr));
j = WSAGetLastError();
finish = clock();
if (finish - start > 2000)
return false;
} while (j !=10056);
return true;
}
int tcpconnect::tcpsend(char* sendbuf, int buflen)
{
int i;
do {
i = send(tcpsocket, sendbuf, buflen, 0);
} while (i != buflen);
return i;
}
// Reads up to `buflen` bytes from the socket into `buf`.
// Returns the number of bytes read. Any recv() failure is reported as 0, so
// callers cannot tell an error (including "no data yet" on the non-blocking
// socket) from a zero-byte read.
int tcpconnect::tcprecv(char* buf,int buflen)
{
    const int received = recv(tcpsocket, buf, buflen, 0);
    return received < 0 ? 0 : received;
}
void tcpconnect::tcpclose()
{
closesocket(tcpsocket);
tcpsocket = INVALID_SOCKET;
tcpsocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
int iMode = 1;
ioctlsocket(tcpsocket, FIONBIO, (u_long FAR*)&iMode);
delete[]host,host=NULL;
delete[]head,head=NULL;
delete[]getmsg,getmsg=NULL;
delete[]resource,resource=NULL;
delete[]buf,buf=NULL;
}
// Builds the request text
//     GET <resource> HTTP/1.1\r\nHost: <host>\r\n\r\n
// into a freshly allocated, NUL-terminated `getmsg`.
//
// Requires `host` and `resource` to be set (see seturl()).
//
// returns: number of bytes to put on the wire. The terminating NUL is not
//          counted, so it is no longer sent as part of the request.
int tcpconnect::makehttpgetmsg()
{
    static const char kMethod[] = "GET ";
    static const char kVersionAndHost[] = " HTTP/1.1\r\nHost: ";
    static const char kEnd[] = "\r\n\r\n";

    const size_t requestLen = (sizeof(kMethod) - 1) + strlen(resource)
                            + (sizeof(kVersionAndHost) - 1) + strlen(host)
                            + (sizeof(kEnd) - 1);

    getmsg = new char[requestLen + 1];
    strcpy(getmsg, kMethod);
    strcat(getmsg, resource);
    strcat(getmsg, kVersionAndHost);
    strcat(getmsg, host);
    strcat(getmsg, kEnd);
    return static_cast<int>(requestLen);
}
int tcpconnect::recvhtml()
{
clock_t start, finish;
int count=0, total, j;
char*tempbuf;
tempbuf = new char[1001];
memset(tempbuf, 0, 1001);
start = clock();
do
{
j = tcprecv(tempbuf+count, 1000-count);
count += j;
finish = clock();
if (finish - start > 2000)
{
break;
}
} while (count <1000);
total = getlen(tempbuf);
if (total == -1)
return -1;
buf = new char[total];
memset(buf, 0, total);
count = inibuf(buf, tempbuf, count);
delete[]tempbuf;
if (count > total || count < 0)
return -1;
start = clock();
j = 0;
do
{
j =tcprecv(buf + count, total - count);
count += j;
if(j!=0)
start = clock();
finish = clock();
if (finish - start > 10000)
return -1;
}while (count < total);
bufsize = total;
return total;
}
// Closes the socket owned by this object and balances the WSAStartup() call
// made in the constructor. The per-request buffers are not touched here;
// they are released by tcpclose().
tcpconnect::~tcpconnect()
{
    ::closesocket(tcpsocket);
    ::WSACleanup();
}
// Splits an address into `host` and `resource`, both freshly allocated.
//
// A leading "http://" or "https://" is dropped. Everything up to the first
// '/' becomes `host`; the rest, including that '/', becomes `resource`
// ("/" when the address has no path).
// NOTE(review): "https://" is only stripped; the connection itself is still
// set up for port 80 by the constructor.
//
// temp: NUL-terminated address; must not be NULL. It is not modified.
void tcpconnect::seturl(char * temp)
{
    static const char kHttp[] = "http://";
    static const char kHttps[] = "https://";

    // Match the scheme as a PREFIX only. Searching the whole string would
    // also fire on a scheme embedded later (for example in a query string)
    // and then chop the first characters off the host.
    if (strncmp(temp, kHttp, sizeof(kHttp) - 1) == 0)
        temp += sizeof(kHttp) - 1;
    else if (strncmp(temp, kHttps, sizeof(kHttps) - 1) == 0)
        temp += sizeof(kHttps) - 1;

    const char* slash = strchr(temp, '/');
    const char* path = (slash != NULL) ? slash : "/";
    const size_t hostLen = (slash != NULL) ? static_cast<size_t>(slash - temp)
                                           : strlen(temp);

    resource = new char[strlen(path) + 1];
    strcpy(resource, path);

    host = new char[hostLen + 1];
    memcpy(host, temp, hostLen);
    host[hostLen] = 0;
}
// testmain.cpp
#include"engine.h"
extern char* getfilename(char* url);
void main()
{
mainclass* engine = new mainclass("www.baidu.com");
while (1);
}