用CHttpConnection 取得的网页数据是乱码(超难,高手请进来测试)
已知源网站是用gzip压缩过的
(最奇怪的是 用VC++抓取其它网页,就正常)
我用下面一段代码取数据(因为要设定HTTP 的Referer),所以用
SendRequest 方法打开数据,而不是用OpenURL 来打开数据
又用PHP 抓取同一网站的内容
PHP抓取出来是正确的
附两个程序的抓取代码:(请注意 VC 抓的那个内容连HTTP头都不见了,PHP有http头, 而且 两个程序写入文件,文件的内容都不一样)
VC++的抓取代码
// Requests sub_url from host through WinINet (MFC) with a custom Referer,
// shows the response body in IDC_EDIT1 / IDC_EDIT2 and dumps it to old.txt and
// new.txt.
//
// The body is handled as raw bytes from socket to disk: it is read with
// Read() instead of the line-oriented ReadString(), its size is taken from the
// byte count instead of strlen(), and the dump files are opened in binary
// mode. A compressed (gzip) body contains NUL and newline bytes, which the
// text-oriented calls would drop, truncate at, or rewrite.
//
// NOTE(review): this block does not decompress the body; the dumped bytes are
// whatever the server sent. The response headers are not part of the body
// stream either; if they are needed they have to be queried from pFile
// separately (e.g. QueryInfo with HTTP_QUERY_RAW_HEADERS_CRLF) - confirm.
CString ref_url = "http://www.s1122.com/app/member/FT_browse/index.php?uid=2c520623m877213l3284&langx=zh-cn&mtype=3";
CString web_url = "http://www.s1122.com/app/member/FT_browse/body_var.php?uid=2c520623m877213l3284&rtype=re&langx=zh-cn&mtype=3";
CString host = "www.s1122.com";
//host = "exshop.oicp.net";
CString sub_url = "/app/member/FT_browse/body_var.php?uid=2c520623m877213l3284&rtype=re&langx=zh-cn&mtype=3";
//sub_url = "/";
try
{
    CInternetSession Session;
    CHttpConnection *pHttpConnect = Session.GetHttpConnection(host);
    if (pHttpConnect)
    {
        CHttpFile *pFile = NULL;
        try
        {
            // sub_url is a variable, so it is passed directly; _T() is only
            // meant for string literals.
            pFile = pHttpConnect->OpenRequest(CHttpConnection::HTTP_VERB_GET,
                                              sub_url,
                                              NULL,
                                              1,
                                              NULL,
                                              NULL,
                                              INTERNET_FLAG_NO_COOKIES);
            if (pFile)
            {
                //pFile->AddRequestHeaders("Accept: image/png,*/*;q=0.5");
                //pFile->AddRequestHeaders("Accept-Language: Big5");
                //pFile->AddRequestHeaders("Accept-Encoding: gzip,deflate");
                //pFile->AddRequestHeaders("Accept-Charset: big5");
                //pFile->AddRequestHeaders("Keep-Alive: 300");
                //pFile->AddRequestHeaders("Connection: keep-alive");
                //pFile->AddRequestHeaders("Cookie: Key=somevalue;domain=abc.com");
                pFile->AddRequestHeaders("Content-Type: text/html;charset=gb2312");
                pFile->AddRequestHeaders("Accept: text/html");
                pFile->AddRequestHeaders("Referer: " + ref_url);
                pFile->AddRequestHeaders("User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)");
                //pFile->AddRequestHeaders("Accept-Encoding: gzip, deflate");
                pFile->SendRequest();

                // Response body, accumulated byte-for-byte. The explicit
                // length keeps embedded NUL bytes inside the CString.
                // Assumes an ANSI/MBCS build (one TCHAR per byte), as the
                // narrow string literals in this block already do.
                CString str;
                char chunk[4096];
                UINT nRead = 0;
                while ((nRead = pFile->Read(chunk, sizeof(chunk))) > 0)
                {
                    str += CString(chunk, static_cast<int>(nRead));
                }

                // The edit control only shows text up to the first NUL byte.
                SetDlgItemText(IDC_EDIT1, str);

                // Both dump files receive the same bytes, written in binary
                // mode with the real byte count.
                const char *dumpNames[] = { "old.txt", "new.txt" };
                for (int i = 0; i < 2; ++i)
                {
                    FILE *fp = fopen(dumpNames[i], "wb");
                    if (fp)
                    {
                        fwrite(static_cast<LPCTSTR>(str), 1, str.GetLength(), fp);
                        fclose(fp);
                    }
                }

                //BIG52GBK(str.GetBuffer(strlen(str)));
                //GBK2GB(str.GetBuffer(strlen(str)));
                SetDlgItemText(IDC_EDIT2, str);
                //AfxMessageBox(str);

                // Fetch the cookie returned by the server
                //CString strInfo;
                //DWORD dw = 0;
                //pFile->QueryInfo(HTTP_QUERY_SET_COOKIE, strInfo, &dw);
                //if (strInfo.IsEmpty()==FALSE)
                //    AfxMessageBox(str);
            }
        }
        catch (CInternetException *)
        {
            // Release both objects before the outer handler runs, so a failed
            // request or read does not leak them. delete on NULL is a no-op.
            delete pFile;
            delete pHttpConnect;
            throw;
        }

        if (pFile)
        {
            pFile->Close();
            delete pFile;
        }
        pHttpConnect->Close();
        delete pHttpConnect;
    }
}
catch (CInternetException *e)
{
    // Best-effort fetch: network errors are discarded silently.
    e->Delete();
}
PHP 抓取代码
<?php
// Fetch test against www.s1122.com for UID 2c520623m877213l3284, dated 2007-03-17 15:04:05.
// (The original comment on this line was mis-encoded; only the host, UID and
// timestamp were legible.)
//
// Sends a hand-built HTTP/1.0 GET over a raw socket, saves the complete
// response (headers + body) to data.txt, echoes the raw body, then inflates
// the body and saves the result to data1.txt.
set_time_limit(0);
$uid = "2c520623m877213l3284";
$web_url = "www.s1122.com";
//$web_url = "http://www.baidu.com/s";
$singbet_ip_address = $web_url;
$refer_url = 'http://'.$singbet_ip_address.'/app/member/FT_browse/index.php?uid='.$uid.'&langx=zh-cn&mtype=3';
$web_url = 'http://'.$singbet_ip_address.'/app/member/FT_browse/body_var.php?uid='.$uid.'&langx=zh-tw&rtype=r&mtype=3&page_no=0';
$request = "/app/member/FT_browse/body_var.php?uid=$uid&rtype=re&langx=zh-cn&mtype=3";
//echo $request;
$host = $singbet_ip_address;
//$web_url = "http://www.baidu.com/";
//$//ch = curl_init();
// set URL and other appropriate options
/*
echo $refer_url;
echo "<br>";
echo $web_url;
echo "<br>";
*/
$httpHeader = "GET $request HTTP/1.0\r\n";
$httpHeader .= "Referer: " . $refer_url . "\r\n";
$httpHeader .= "User-Agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)\r\n";
$httpHeader .= "Host: $host\r\n";
// A single empty line terminates the header section; every header line above
// already ends with its own CRLF.
$httpHeader .= "\r\n";
$port = 80;
$fp = @fsockopen($host, $port);
$retStr = "";
if ( $fp ) {
    fwrite($fp, $httpHeader);
    while (! feof($fp)) {
        $retStr .= fread($fp, 1024);
    }
    // Close only a socket that was actually opened.
    fclose($fp);
}
//echo $content;
//echo $content;

// Raw response dump; binary mode so compressed bytes are not altered.
$fp = fopen("data.txt", "wb");
if ( $fp ) {
    fwrite($fp, $retStr, strlen($retStr));
    fclose($fp);
}

// Split at the FIRST blank line only: the body is binary and may itself
// contain the byte sequence "\r\n\r\n".
$parts = explode("\r\n\r\n", $retStr, 2);
$content = isset($parts[1]) ? $parts[1] : "";
echo $content;

// Skips the first 10 bytes before inflating, i.e. assumes a fixed-size gzip
// header without optional fields.
if (strlen($content) > 10) {
    $content = gzinflate(substr($content, 10));
    if ($content === false) {
        $content = "";
    }
} else {
    $content = "";
}

$fp = fopen("data1.txt", "wb");
if ( $fp ) {
    fwrite($fp, $content, strlen($content));
    fclose($fp);
}
?>