2,100
社区成员
发帖
与我相关
我的任务
分享
public void process(Page page) {
//省略了一些代码,主要是思路
List<String> links = = page.getHtml().links().regex(info.getUrlReg()).all();
for (String url : links) {
Request request = new Request();
request.setUrl(url);
Map<String, Object> extras = new HashMap<String, Object>();
if (startPage) {
extras.put("_level", START_DEPTH + 1);
} else {
//获取上层页面的深度再加一就是这个URL的深度
extras.put("_level", (Integer) page.getRequest().getExtra("_level") + 1);
}
request.setExtras(extras);
page.addTargetRequest(request);
}
}
然后在scheduler里面做限制深度的处理
/**
 * A {@link QueueScheduler} that drops requests whose {@code "_level"} extra
 * exceeds a configured limit.
 *
 * <p>Requests that carry no {@code "_level"} extra are always passed through
 * to the parent scheduler.
 */
public class DepthScheduler extends QueueScheduler {

    /** Highest {@code "_level"} value that is still accepted. */
    private final int levelLimit;

    /** Creates a scheduler with a level limit of 3. */
    public DepthScheduler() {
        this(3);
    }

    /**
     * Creates a scheduler with the given level limit.
     *
     * @param levelLimit highest {@code "_level"} value that is still accepted
     */
    public DepthScheduler(int levelLimit) {
        this.levelLimit = levelLimit;
    }

    /**
     * Forwards the request to the parent scheduler unless its {@code "_level"}
     * extra is present and greater than the limit.
     *
     * @param request the request to enqueue
     * @param task    the task the request belongs to
     */
    @Override
    public void push(Request request, us.codecraft.webmagic.Task task) {
        Object level = request.getExtra("_level");
        boolean tooDeep = level != null && ((Integer) level) > levelLimit;
        if (tooDeep) {
            return;
        }
        super.push(request, task);
    }
}