甘肃企业http抓取莫名其妙失败

liangtu 2015-03-06 11:27:38
甘肃省工商网站http://xygs.gsaic.gov.cn/gsxygs/,通过企业名称抓取基本信息。

抓取企业信息的步骤是:打开首页、输入企业名称、输入验证码、搜索。
通过浏览器操作一切正常;但通过 HTTP 抓取时,搜索请求总是不返回想要的数据(正常情况下,搜索后应该返回结果列表),也没有任何提示信息。


import java.io.File;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Crawls basic company information from the Gansu enterprise credit site
 * (http://xygs.gsaic.gov.cn/gsxygs/) by company name.
 *
 * <p>The flow mirrors what a browser does: open the home page to obtain the
 * session cookie, visit {@code main.jsp}, download the captcha image (whose
 * response carries the expected captcha answer in a {@code Set-Cookie}
 * header), then POST the search form to {@code pub!list.do}.
 */
public class Gansu_Qyxx_Ver1
{
    private static final Logger logger = LoggerFactory.getLogger(Gansu_Qyxx_Ver1.class);

    public static String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; InfoPath.2)";
    public static String HOST = "xygs.gsaic.gov.cn";
    public static String ORIGIN = "http://xygs.gsaic.gov.cn";

    private static final String BASE_URL = "http://xygs.gsaic.gov.cn/gsxygs/";
    private static final String MAIN_URL = "http://xygs.gsaic.gov.cn/gsxygs/main.jsp";
    private static final String CAPTCHA_URL = "http://xygs.gsaic.gov.cn/gsxygs/securitycode.jpg";
    private static final String SEARCH_URL = "http://xygs.gsaic.gov.cn/gsxygs/pub!list.do";
    private static final String UNUSED_CAPTCHA_URL = "http://118.180.7.222/anRepEn/authcode.jsp";
    private static final String UNUSED_CAPTCHA_HOST = "118.180.7.222";

    private static final String HTML_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    private static final String IMAGE_ACCEPT = "image/webp,*/*;q=0.8";
    private static final String IE_ACCEPT = "application/x-ms-application, image/jpeg, application/xaml+xml, image/gif, image/pjpeg, application/x-ms-xbap, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";

    /** Directory the downloaded captcha images are written to. */
    private static final String CAPTCHA_DIR = "E:\\甘肃\\";

    /**
     * Entry point: runs one sample query against the Gansu site.
     *
     * <p>Other sample company names used during testing:
     * 甘肃农业科技开发公司, 甘肃农业审计事务所, 甘肃祥盛商贸服务有限公司.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        // crawlerCompany never throws, so no try/catch is needed here.
        crawlerCompany("甘肃农业大学印刷厂");
    }

    /**
     * Crawls one company and logs (rather than propagates) any failure.
     *
     * @param name company name to search for
     */
    public static void crawlerCompany(String name)
    {
        try
        {
            crawler(name);
        } catch (Exception e)
        {
            logger.error("抓取企业信息失败: " + name, e);
        }
    }

    /**
     * Runs the full request sequence for one company name and prints the
     * search result HTML to standard output.
     *
     * @param name company name to search for
     * @throws Exception on any network or file-system failure
     */
    private static void crawler(String name) throws Exception
    {
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(60000);
        httpClient.getHttpConnectionManager().getParams().setSoTimeout(60000);
        httpClient.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, "utf-8");

        try
        {
            String sessionCookie = fetchSessionCookie(httpClient);
            visitMainPage(httpClient, sessionCookie);
            downloadUnusedCaptcha(httpClient);

            String captchaContent = downloadCaptcha(httpClient, sessionCookie);
            logger.info("captchaContent=" + captchaContent);
            if (captchaContent == null)
            {
                // Without the captcha answer the search form cannot be accepted.
                logger.warn("No captcha value found in the securitycode.jpg response; search aborted");
                return;
            }

            String searchCookie = "session_authcode=" + captchaContent + "; " + sessionCookie;
            String htmlStr = search(httpClient, name, captchaContent, searchCookie);
            if (htmlStr != null)
            {
                System.out.println(htmlStr);
            }
        } finally
        {
            // Runs on every path, including the early returns and exceptions.
            httpClient.getHttpConnectionManager().closeIdleConnections(0);
            System.out.println("释放资源成功!!!");
        }
    }

    /**
     * Opens the home page and returns the cookies it sets, formatted as a
     * {@code Cookie} request-header value ({@code name=value} pairs joined
     * with {@code "; "}, attributes such as {@code Path} stripped).
     */
    private static String fetchSessionCookie(HttpClient httpClient) throws Exception
    {
        GetMethod get = newGet(BASE_URL, HTML_ACCEPT, null, HOST, null);
        try
        {
            httpClient.executeMethod(get);
            StringBuilder cookies = new StringBuilder();
            for (Header header : get.getResponseHeaders("Set-Cookie"))
            {
                String pair = stripCookieAttributes(header.getValue());
                if (pair.length() == 0)
                {
                    continue;
                }
                if (cookies.length() > 0)
                {
                    cookies.append("; ");
                }
                cookies.append(pair);
            }
            return cookies.toString();
        } finally
        {
            get.releaseConnection();
        }
    }

    /** Requests {@code main.jsp} with the session cookie, as a browser would after the home page. */
    private static void visitMainPage(HttpClient httpClient, String sessionCookie) throws Exception
    {
        logger.info("访问main.jsp的cookie:" + sessionCookie);
        GetMethod get = newGet(MAIN_URL, HTML_ACCEPT, BASE_URL, HOST, sessionCookie);
        get.addRequestHeader("Cache-Control", "max-age=0");
        try
        {
            httpClient.executeMethod(get);
        } finally
        {
            get.releaseConnection();
        }
    }

    /**
     * Downloads the secondary captcha image that the page references but the
     * search does not use, and stores it under {@link #CAPTCHA_DIR}.
     */
    private static void downloadUnusedCaptcha(HttpClient httpClient) throws Exception
    {
        GetMethod get = newGet(UNUSED_CAPTCHA_URL, IMAGE_ACCEPT, MAIN_URL, UNUSED_CAPTCHA_HOST, null);
        try
        {
            httpClient.executeMethod(get);
            FileUtils.writeByteArrayToFile(new File(CAPTCHA_DIR + System.currentTimeMillis() + ".jpg"), get.getResponseBody());
            logger.info("下载验证码1成功!!!");
        } finally
        {
            get.releaseConnection();
        }
    }

    /**
     * Downloads the search captcha image, stores it under {@link #CAPTCHA_DIR}
     * and extracts the captcha answer from the response's {@code Set-Cookie}
     * header(s). When several headers are present the last one wins.
     *
     * @return the captcha answer, or {@code null} if the download did not
     *         return HTTP 200 or no usable {@code Set-Cookie} header was sent
     */
    private static String downloadCaptcha(HttpClient httpClient, String sessionCookie) throws Exception
    {
        logger.info("下载验证码的cookie:" + sessionCookie);
        GetMethod get = newGet(CAPTCHA_URL + "?v=" + System.currentTimeMillis(), IMAGE_ACCEPT, MAIN_URL, HOST, sessionCookie);
        try
        {
            int statusCode = httpClient.executeMethod(get);
            FileUtils.writeByteArrayToFile(new File(CAPTCHA_DIR + System.currentTimeMillis() + ".jpg"), get.getResponseBody());
            logger.info("下载验证码2成功!!!");
            if (statusCode != 200)
            {
                logger.warn("Captcha download returned HTTP status " + statusCode);
                return null;
            }

            String captchaContent = null;
            for (Header header : get.getResponseHeaders("Set-Cookie"))
            {
                logger.info("header.getValue()=" + header.getValue());
                String value = extractCookieValue(header.getValue());
                if (value != null)
                {
                    captchaContent = value;
                }
            }
            return captchaContent;
        } finally
        {
            get.releaseConnection();
        }
    }

    /**
     * Posts the search form exactly once and returns the response body.
     *
     * @return the result HTML, or {@code null} if the server did not answer with HTTP 200
     */
    private static String search(HttpClient httpClient, String name, String captchaContent, String cookie) throws Exception
    {
        PostMethod postMethod = new PostMethod(SEARCH_URL);
        NameValuePair[] params =
        {
            new NameValuePair("browse", ""),
            new NameValuePair("loginName", ""),
            new NameValuePair("cerNo", ""),
            new NameValuePair("authCode", ""),
            new NameValuePair("authCodeQuery", captchaContent),
            new NameValuePair("queryVal", name)
        };
        postMethod.setRequestBody(params);

        // A single Accept header, matching the MSIE 8 user agent sent below.
        postMethod.addRequestHeader("Accept", IE_ACCEPT);
        // No Accept-Encoding: the response body is read as plain text and is
        // not decompressed here, so compressed responses must not be invited.
        postMethod.addRequestHeader("Accept-Language", "zh-CN");
        postMethod.addRequestHeader("Cache-Control", "no-cache");
        postMethod.addRequestHeader("Connection", "Keep-Alive");
        postMethod.addRequestHeader("Content-Type", "application/x-www-form-urlencoded");
        postMethod.addRequestHeader("User-Agent", USER_AGENT);
        postMethod.addRequestHeader("Referer", MAIN_URL);
        postMethod.addRequestHeader("Host", HOST);
        postMethod.addRequestHeader("Origin", ORIGIN);
        logger.info("访问list.do的cookie:" + cookie);
        postMethod.addRequestHeader("Cookie", cookie);

        try
        {
            // Capture the status of THIS request; it decides whether the body is usable.
            int statusCode = httpClient.executeMethod(postMethod);
            logger.info("甘肃执行查询状态 = " + statusCode);
            if (statusCode != 200)
            {
                return null;
            }
            return postMethod.getResponseBodyAsString();
        } finally
        {
            postMethod.releaseConnection();
        }
    }

    /**
     * Builds a GET request carrying the browser-like headers shared by all
     * page and image requests.
     *
     * @param url     absolute request URL
     * @param accept  value of the {@code Accept} header
     * @param referer value of the {@code Referer} header, or {@code null} to omit it
     * @param host    value of the {@code Host} header
     * @param cookie  value of the {@code Cookie} header, or {@code null} to omit it
     */
    private static GetMethod newGet(String url, String accept, String referer, String host, String cookie)
    {
        GetMethod get = new GetMethod(url);
        get.addRequestHeader("Accept", accept);
        // No Accept-Encoding: bodies are consumed as raw bytes/text without decompression.
        get.addRequestHeader("Accept-Language", "zh-CN,zh;q=0.8");
        get.addRequestHeader("Connection", "keep-alive");
        get.addRequestHeader("User-Agent", USER_AGENT);
        if (referer != null)
        {
            get.addRequestHeader("Referer", referer);
        }
        get.addRequestHeader("Host", host);
        if (cookie != null)
        {
            get.addRequestHeader("Cookie", cookie);
        }
        return get;
    }

    /**
     * Returns the leading {@code name=value} pair of a {@code Set-Cookie}
     * header value, i.e. everything before the first {@code ';'}.
     */
    private static String stripCookieAttributes(String setCookie)
    {
        int end = setCookie.indexOf(';');
        String pair = end < 0 ? setCookie : setCookie.substring(0, end);
        return pair.trim();
    }

    /**
     * Returns only the value of the cookie in a {@code Set-Cookie} header
     * value: the text between the first {@code '='} and the following
     * {@code ';'} (or the end of the string).
     *
     * @return the cookie value, or {@code null} if the header has no {@code '='}
     */
    private static String extractCookieValue(String setCookie)
    {
        int eq = setCookie.indexOf('=');
        if (eq < 0)
        {
            return null;
        }
        int end = setCookie.indexOf(';', eq + 1);
        String value = end < 0 ? setCookie.substring(eq + 1) : setCookie.substring(eq + 1, end);
        return value.trim();
    }
}



...全文
262 3 打赏 收藏 转发到动态 举报
写回复
用AI写文章
3 条回复
切换为时间正序
请发表友善的回复…
发表回复
u011066523 2016-01-20
  • 打赏
  • 举报
回复
楼主解决了么?我也遇到嘞~求解
missMeyo 2015-03-06
  • 打赏
  • 举报
回复
应该是请求搞错了。只需要解决登录和需要访问的页面二个链接就好。
liangtu 2015-03-06
  • 打赏
  • 举报
回复
引用 1 楼 missMeyo 的回复:
应该是请求搞错了。只需要解决登录和需要访问的页面二个链接就好。
1、链接没问题 2、不需要登录

67,513

社区成员

发帖
与我相关
我的任务
社区描述
J2EE只是Java企业应用。我们需要一个跨J2SE/WEB/EJB的微容器,保护我们的业务核心组件(中间件),以延续它的生命力,而不是依赖J2SE/J2EE版本。
社区管理员
  • Java EE
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧