获取网页内容时只能获取到一部分,是什么原因呢?

angel0829 2008-05-08 10:34:23

#include <string>
#include <iostream>
#include <fstream>
#include "winsock2.h"
#include <time.h>

#pragma comment(lib, "ws2_32.lib")

using namespace std;

#define DEFAULT_PAGE_BUF_SIZE 1048576
#define HEADER_BUF_SIZE 1024

void main()
{
WSADATA wsaData;
int err;
err = WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0 )
{
return;
}

clock_t start, finish;
double duration;

start = clock();

char host[] = "www.szedu.com.cn";
char *request = "GET / HTTP/1.0\r\nHost: www.sina.com.cn\r\nConnection: Keep-Alive\r\n\r\n";

struct hostent *hp;
hp = gethostbyname(host);
if(hp == NULL)
{
cout << "gethostbyname() error in GetIpByHost: " << host << endl;
return;
}

struct in_addr inAddr;
LPSTR lpAddr;
lpAddr = hp->h_addr;
memmove(&inAddr,lpAddr,4);

int sock, ret = 0, optval = 1;
struct sockaddr_in sa;
unsigned long inaddr;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr));

sock = socket(AF_INET, SOCK_STREAM, 0);
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock == -1)
{
return;
}
if(sock == -2)
{
return;
}

ret = send(sock, request, strlen(request), 0);

char headerBuf[1024];
memset(headerBuf, 0, 1024);
ret = read_header(sock, headerBuf);
if(ret < 0)
{
closesocket(sock);
return;
}

if(strlen(headerBuf) == 0)
{
cout << "strlen(headerBuf) = 0" << headerBuf << endl;
closesocket(sock);
return;
}

int leng = GetContentLength(headerBuf);
int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;

char *pageBuf;
pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);

int bytesRead = 0;
int pre_ret = 0;
while(ret != SOCKET_ERROR)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength, 0);
if(ret == 0)
{
break;
}

if(ret == -1 && pre_ret == 0)
{
closesocket(sock);
if(pageBuf)
{
free(pageBuf); pageBuf=NULL;
}
cout << "read()'s retval=-1" << endl;

return;
}
else if(ret == -1 && pre_ret)
{
break;
}

pre_ret = ret;
bytesRead += ret;
#if 0
if(ret >= m_nContentLength)
{
pageBuf = (char *)realloc(pageBuf, bytesRead + m_nContentLength);
if(pageBuf == NULL)
{
closesocket(sock);
if(pageBuf)
{
free(pageBuf); pageBuf=NULL;
}
cout << "realloc()" << endl;

return;
}
}
#endif
}
pageBuf[bytesRead] = '\0';

cout << bytesRead << endl;

ofstream ofs;
ofs.open("ofs.txt");
ofs << pageBuf << endl;

ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup();

finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
cout << "have cost " << duration << " seconds\n";

return;
}



这是我写的获取远程网页内容的简单代码,然后出现了获取网页只能获取一部分的问题,在VC下单步跟踪的话,可以获取所有内容,直接运行的话就不行了,请高手指点下原因。十分感谢。
...全文
147 8 打赏 收藏 转发到动态 举报
写回复
用AI写文章
8 条回复
切换为时间正序
请发表友善的回复…
发表回复
angel0829 2008-05-13
  • 打赏
  • 举报
回复
高手进来解答一下啊,拜托了。
angel0829 2008-05-13
  • 打赏
  • 举报
回复
高手进来解答一下啊,拜托了。
angel0829 2008-05-11
  • 打赏
  • 举报
回复
又按照你说的修改了,可惜还是一样的结果。

如果把recv的第三个参数修改为m_nContentLength-bytesRead的话,运行好几分钟才能获取到


#include <string>
#include <iostream>
#include <fstream>
#include "winsock2.h"
#include <time.h>

#pragma comment(lib, "ws2_32.lib")

using namespace std;

#define DEFAULT_PAGE_BUF_SIZE 1048576

void main()
{
WSADATA wsaData;
int err;
err = WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0 )
{
return;
}

clock_t start, finish;
double duration;

start = clock();

char host[] = "www.sina.com.cn";
char *request = "GET / HTTP/1.0\r\nHost: www.sina.com.cn\r\nConnection: Close\r\n\r\n";

struct hostent *hp;
hp = gethostbyname(host);
if(hp == NULL)
{
cout << "gethostbyname() error in GetIpByHost: " << host << endl;
return;
}

struct in_addr inAddr;
LPSTR lpAddr;
lpAddr = hp->h_addr;
memmove(&inAddr,lpAddr,4);

int sock, ret = 0, optval = 1;
struct sockaddr_in sa;
unsigned long inaddr;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr));

sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock == -1)
{
return;
}
if(sock == -2)
{
return;
}

ret = send(sock, request, strlen(request), 0);

int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;

char *pageBuf;
pageBuf = (char *)malloc(m_nContentLength);
memset(pageBuf, 0, m_nContentLength);

int bytesRead = 0;
while(ret != SOCKET_ERROR)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength, 0);

if(ret > 0)
{
bytesRead += ret;
}
}
pageBuf[bytesRead] = '\0';

cout << bytesRead << endl;

ofstream ofs;
ofs.open("ofs.txt");
ofs << pageBuf << endl;

ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup();

finish = clock();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
cout << "have cost " << duration << " seconds\n";

return;
}
zhenhaojia 2008-05-11
  • 打赏
  • 举报
回复
Connection: Keep-Alive ------> Connection: close
zhenhaojia 2008-05-11
  • 打赏
  • 举报
回复
Connection: Keep-Alive,问题出在这里。
试一下:Connection: close
ucMIPS 2008-05-09
  • 打赏
  • 举报
回复
猪头,至少两个修改才看得到比较完整的页面

1 ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);


2 ofs << headerBuf << endl;
ofs << pageBuf << endl;


angel0829 2008-05-09
  • 打赏
  • 举报
回复
楼上的,我按照你的修改了,可是获取一个页面需要130多秒。。。。

能否再指点指点?不胜感激啊。

2,760

社区成员

发帖
与我相关
我的任务
社区描述
搜索引擎的服务器通过网络搜索软件或网络登录等方式,将Internet上大量网站的页面信息收集到本地,经过加工处理建立信息数据库和索引数据库。
社区管理员
  • 搜索引擎技术社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧