51,410
社区成员
发帖
与我相关
我的任务
分享
@Slf4j
@Component
public class SpiderUtil {
@Resource
private DynamicIpUtil dynamicIpUtil;
/**
* 根据url爬取页面信息
*
* @param url url
* @return 页面信息
*/
public Document spiderDocument(String url) {
Document pageDoc = null;
try {
Connection con= Jsoup.connect(url)
.userAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)")
.timeout(5000);
/*.ignoreHttpErrors(true)
.followRedirects(true)*/
Connection.Response resp = con.execute();
if (resp.statusCode() == 200){
pageDoc = con.get();
} else {
log.error("http status error");
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
if(pageDoc == null || pageDoc.toString().trim().equals("")) {// 表示ip被拦截或者其他情况
log.error("ip被拦截 无内容");
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
} catch (Exception e) {
log.error("ip被拦截 异常: {}", e);
dynamicIpUtil.getMyIpInfo();
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
if (ipDefensed(url, pageDoc)) {
// 如果被ip限制了,更换动态ip
dynamicIpUtil.changeMyIp();
spiderDocument(url);
}
return pageDoc;
}
/**
* 判断ip是否被封
*
* @param pageDoc 页面信息
* @return ip
*/
private boolean ipDefensed(String url, Document pageDoc) {
boolean ipDefensed = false;
if (url.contains("anjuke.com")) {
ipDefensed = AJKIpDefense(pageDoc);
}
return ipDefensed;
}
/**
* 安居客判断ip是否被封
*
* @param pageDoc 页面信息
*/
private boolean AJKIpDefense(Document pageDoc) {
log.error("ip 被拦截 安居客");
boolean ajkppDefensed = false;
String title = pageDoc.title();
if (title.equals("访问验证-安居客")) {
ajkppDefensed = true;
}
return ajkppDefensed;
}
}
@Slf4j
@Component
public class DynamicIpUtil {
private static List<String[]> ipAndPorts = new ArrayList<String[]>();
private static Integer ipPageNum = 1;
/**
* 更换动态ip
*/
public void changeMyIp() {
String [] ipAndPort = getDynamicIpAndPort();
String ip = ipAndPort[0];
String port = ipAndPort[1];
System.setProperty("http.maxRedirects", "50");
System.setProperty("https.maxRedirects", "50");
System.getProperties().setProperty("proxySet", "true");
System.getProperties().setProperty("http.proxyHost", ip);
System.getProperties().setProperty("http.proxyPort", port);
System.getProperties().setProperty("https.proxyHost", ip);
System.getProperties().setProperty("https.proxyPort", port);
}
/**
* 获取ip信息
*/
public void getMyIpInfo(){
try {
Document ipDoc = Jsoup.connect("http://www.ip.cn")
.userAgent("Mozilla")
.timeout(3000)
.get();
if(ipDoc != null){
String ipInfo = ipDoc.select(".well").first().text();
log.info("更换ip 成功: {}", ipInfo);
}
} catch (Exception e) {
log.info("暂不能获取ip 信息");
}
}
/**
* 获取动态ip
*
* @return 动态ip
*/
private String[] getDynamicIpAndPort() {
String[] ipAndPort = null;
if (ipAndPorts != null && ipAndPorts.size() > 0) {
ipAndPort = ipAndPorts.get(0);
ipAndPorts.remove(0);
} else {
try {
Document pageDoc = Jsoup.connect("http://www.xicidaili.com/wn/" + ipPageNum)
.userAgent("Mozilla")
.timeout(5000)
.get();
Elements elements = pageDoc.select("tr.odd");
ipPageNum ++;
if(ipPageNum > 400){
ipPageNum = 1;
}
for(Element element : elements){
String[] ipPort = new String[2];
String ip = element.child(1).text();
String port = element.child(2).text();
String noName = element.child(4).text();
// if(!noName.equals("高匿")){
// continue;
// }
String speedStr = element.child(6).select(".bar").first().attr("title");
double speed = Double.valueOf(speedStr.substring(0, speedStr.indexOf("秒")));
String timeStr = element.child(7).select(".bar").first().attr("title");
double time = Double.valueOf(timeStr.substring(0, timeStr.indexOf("秒")));
if(speed <= 1 && time <= 1){
ipPort[0] = ip;
ipPort[1] = port;
ipAndPorts.add(ipPort);
}
}
return getDynamicIpAndPort();
} catch (IOException e) {
log.error("get DynamicIpError error info :\n {}", e);
}
}
return ipAndPort;
}
}