使用HttpClient4爬取Google搜索

vnetoolxw_87 2013-05-16 05:46:16

// URL of Google's back-end search handler.
String url = "http://www.google.com.tw/search";

// Search keyword; it contains Chinese characters, so it is passed through StringUtils.encode first.
String keyword = StringUtils.encode("lucene案例");

/************ request headers: begin **************/
// Header name/value pairs, inserted into the map in the order listed here.
String[][] headerPairs = {
    {"User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0"},
    {"Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"},
    {"Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"},
    {"Connection", "keep-alive"},
    {"Host", "www.google.com.tw"},
    {"Referer", "http://www.google.com.tw/"},
    // NOTE(review): hard-coded cookie value copied into the source — presumably from a browser session; confirm it is still valid.
    {"Cookie", "PREF=ID=4d2dc6153085c414:U=9b93a80a00c6b3ab:FF=0:NW=1:TM=1368676720:LM=1368681031:S=7QTClQO_Wxw-xxm4; NID=67=GzrhiTQQi9wX7fovCsOmQrE779tmIa5bwDhKiuTyqr95doIZVfrXl3v7OEMObVotc28BDemQG3rHU_T1QOpw5ugvQsC72doeBZ712TMBaybPejwYaIytMkHFoXbgPTvN"}
};
Map<String, String> headers = new HashMap<String, String>();
for (String[] pair : headerPairs) {
    headers.put(pair[0], pair[1]);
}
/************ request headers: end **************/

/************ request parameters: begin **************/
// Query-string name/value pairs; "q" and "oq" both carry the encoded keyword.
String[][] paramPairs = {
    {"newwindow", "1"},
    {"q", keyword},
    {"oq", keyword},
    {"gs_l", "serp.3...101402.101402.2.102180.1.1.0.0.0.0.0.0..0.0...0.0...1c.1.12.serp.VikCJvSgme0"},
    {"bav", "on.2,or."},
    {"fp", "fdc54ad21c4efd6b"},
    {"biw", "1077"},
    {"bih", "636"},
    {"tch", "1"},
    {"ech", "1"},
    {"psi", "jGqUUeOfD8KokAWsyoDQBA.1368681095489.3"}
};
Map<String, String> params = new HashMap<String, String>();
for (String[] pair : paramPairs) {
    params.put(pair[0], pair[1]);
}
/************ request parameters: end **************/

// Wall-clock start, used for the elapsed-time report at the bottom.
long start = System.currentTimeMillis();
// Number of result pages to crawl.
int crawlPage = 10;
// Results per page (Google shows 10 per page by default).
int pageSize = 10;
// Result URLs already printed; insertion-ordered and used for de-duplication across pages.
Set<String> urlSet = new LinkedHashSet<String>();

// None of the patterns depends on the page being processed, so each is compiled exactly once.
// A run of percent-escapes such as "%E6%A1%88". Only real hex digits are accepted so that
// text like "100%width" is never handed to the decoder.
Pattern escapeRunPattern = Pattern.compile("(%[0-9A-Fa-f]{2})+");
// Starts at a <h3 class="r"> result heading. The ".*" is greedy, so on the flattened page a
// match normally runs from the first heading to the end of the text.
Pattern resultHeadingPattern = Pattern.compile("(<h3 class=\"r\">)<a[^>]*>.*(</a></h3>)?");
// An opening <a ... href=...> tag. "\\b" is the regex word boundary; a single-backslash "\b"
// in a Java string literal would be a backspace character instead.
Pattern anchorTagPattern = Pattern.compile("(<a[\\s+]*([^>h]|h(?!ref\\b))*href[\\s+]*=[\\s+]*[('|\")]?)([^(\\s+|'|\")]*)([^>]*>)", Pattern.MULTILINE);
// Captures the double-quoted href value of an <a> tag in group 1.
Pattern hrefValuePattern = Pattern.compile("<a\\s.*?href=\"([^\"]+)\"[^>]*>");

for (int page = 0; page < crawlPage; page++) {
    // Zero-based index of the first result on this page.
    int startIndex = page * pageSize;
    params.put("start", String.valueOf(startIndex));

    String html = HttpClientUtils.getHTML(url, headers, params);
    if (html == null) {
        // NOTE(review): getHTML is not visible here; this assumes it may return null on failure.
        System.err.println("No HTML returned for start=" + startIndex + ", skipping page");
        continue;
    }

    // Rewrite "\xNN" escapes as "%NN" so the same decoder handles both forms.
    html = html.replace("\\x", "%");

    // Decode every escape run in one left-to-right pass. Replacing each matched run globally
    // (as a plain String.replace would) can break a later, longer run that begins with the
    // same escape, leaving it partly undecoded.
    Matcher escapeMatcher = escapeRunPattern.matcher(html);
    StringBuffer decoded = new StringBuffer(html.length());
    while (escapeMatcher.find()) {
        String plain = StringUtils.decode(escapeMatcher.group());
        // quoteReplacement keeps '$' and '\' in the decoded text from being read as group references.
        escapeMatcher.appendReplacement(decoded, Matcher.quoteReplacement(plain));
    }
    escapeMatcher.appendTail(decoded);

    // Drop backslashes and flatten the page to one line. '\r' and '\n' are removed separately
    // so that "\r\n" pairs do not leave a stray '\r' behind.
    html = decoded.toString().replace("\\", "").replace("\r", "").replace("\n", "");

    // Dump the cleaned page to a fixed path. The meaning of the final boolean is defined by
    // FileUtils.writeFile, which is not visible here (presumably an append flag — confirm).
    FileUtils.writeFile(html, "C:/SVG/test.html", "UTF-8", false);

    Matcher headingMatcher = resultHeadingPattern.matcher(html);
    while (headingMatcher.find()) {
        String line = headingMatcher.group();
        Matcher anchorMatcher = anchorTagPattern.matcher(line);
        while (anchorMatcher.find()) {
            String link = anchorMatcher.group();
            // Keep only tags that begin with "<a href", carry an onmousedown handler, are not
            // "#" placeholders and do not point at Google's web cache.
            if (link.contains("href=\"#\"")
                    || !link.contains("<a href")
                    || !link.contains("onmousedown")
                    || link.contains("http://webcache.googleusercontent.com/search")) {
                continue;
            }

            Matcher hrefMatcher = hrefValuePattern.matcher(link);
            if (!hrefMatcher.find()) {
                continue;
            }
            String text = hrefMatcher.group(1);
            if (!text.startsWith("http://")) {
                continue;
            }

            // Exclude hyperlinks that do not sit directly inside <h3 class="r">...</h3>.
            String prefix = "<h3 class=\"r\"><a href=\"" + text + "\"";
            if (!line.contains(prefix)) {
                continue;
            }

            // Encode the URL, then restore the structural characters '/', ':', '?' and '&'
            // and turn '+' into "%20". All replacements are literal, so no regex is needed.
            String targetUrl = StringUtils.encode(text)
                    .replace("%2F", "/")
                    .replace("%3A", ":")
                    .replace("%3F", "?")
                    .replace("%26", "&")
                    .replace("+", "%20");

            // Set.add returns false for a duplicate, so each URL is printed only once.
            if (urlSet.add(targetUrl)) {
                System.out.println(targetUrl);
            }
        }
    }
    // Visual separator between pages.
    System.out.println("/////////////////////////////////////////////////////////////////////////////////////////\n");
}

long end = System.currentTimeMillis();
long diff = end - start;
// Format the elapsed milliseconds with the hour / minute / second unit labels.
String haoshi = StringUtils.formatMillSecond(diff, "小时", "分", "秒");
System.out.println("总共耗时: [" + haoshi + "]");

下面是效果图:



没事写着玩的,求围观!
...全文
119 回复 打赏 收藏 转发到动态 举报
写回复
用AI写文章
回复
切换为时间正序
请发表友善的回复…
发表回复

81,092

社区成员

发帖
与我相关
我的任务
社区描述
Java Web 开发
社区管理员
  • Web 开发社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧