package org.crawler;

import java.util.concurrent.LinkedBlockingQueue;

// Worker that takes a URL off the unvisited queue, downloads the page,
// and hands the result to the parser via webPageDB.
public class GetPage implements Runnable {
    private VisitedUrl visitedUrl;
    private LinkedBlockingQueue<String> unvisitedUrl;
    private LinkedBlockingQueue<WebPage> webPageDB;

    public GetPage(LinkedBlockingQueue<WebPage> _webPageDB,
            VisitedUrl _visitedUrl, LinkedBlockingQueue<String> _unvisitedUrl) {
        this.webPageDB = _webPageDB;
        this.visitedUrl = _visitedUrl;
        this.unvisitedUrl = _unvisitedUrl;
    }

    @Override
    public void run() {
        while (!Thread.interrupted()) {
            try {
                // take() blocks until a URL is available, so no racy
                // isEmpty()/wait() check is needed before it
                String visitingUrl = unvisitedUrl.take();
                String html = new GetHTML().GetHTML(visitingUrl);
                visitedUrl.addVisitedUrl(visitingUrl);
                System.out.println(visitingUrl);
                WebPage webPage = new WebPage();
                webPage.setUrl(visitingUrl);
                webPage.setContent(html);
                webPageDB.put(webPage);
            } catch (InterruptedException e) {
                // restore the flag so the loop condition can see it and exit
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
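GetPage calls a GetHTML helper that the post never shows. Below is a minimal sketch of what it might look like; only the class name and the GetHTML(String) signature come from the call site above, and everything else (plain HttpURLConnection, UTF-8, 5-second timeouts) is an assumption, not the poster's actual code.

package org.crawler;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical stand-in for the missing GetHTML class: fetch a URL and
// return the response body as a string.
public class GetHTML {
    public String GetHTML(String url) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(5000);
        StringBuilder html = new StringBuilder();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append('\n');
            }
        } finally {
            reader.close();
        }
        return html.toString();
    }
}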
package org.crawler;

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;

// Worker that takes a fetched page off webPageDB, extracts its links,
// and queues the ones that have not been seen yet.
public class Parse implements Runnable {
    private LinkedBlockingQueue<WebPage> webPageDB;
    private VisitedUrl visitedUrl;
    private LinkedBlockingQueue<String> unvisitedUrl;

    public Parse(LinkedBlockingQueue<WebPage> _webPageDB,
            VisitedUrl _visitedUrl, LinkedBlockingQueue<String> _unvisitedUrl) {
        this.webPageDB = _webPageDB;
        this.visitedUrl = _visitedUrl;
        this.unvisitedUrl = _unvisitedUrl;
    }

    @Override
    public void run() {
        while (!Thread.interrupted()) {
            try {
                // take() blocks until a page is available
                WebPage visitingPage = webPageDB.take();
                Set<String> links = extractLinks(visitingPage.getUrl(),
                        visitingPage.getContent());
                addUnvisitedUrl(links);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }

    // Note: synchronized only locks this Parse instance, so two Parse
    // workers could still enqueue the same link twice; the null/empty
    // checks run before the (possibly costly) contains() calls.
    public synchronized void addUnvisitedUrl(Set<String> links) {
        for (String link : links) {
            if (link != null && !link.trim().equals("")
                    && !visitedUrl.contains(link) && !unvisitedUrl.contains(link)) {
                unvisitedUrl.add(link);
            }
        }
    }

    public static Set<String> extractLinks(String url, String html) {
        // parse all the links out of the html with htmlparser
        // (implementation omitted in the post)
        Set<String> links = new HashSet<String>();
        return links;
    }
}
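The body of extractLinks is elided in the post; its comment says it uses htmlparser. As a stand-in, here is a dependency-free sketch that pulls href attributes with a crude regex and resolves relative links against the page URL. The class name LinkExtractor and the regex are illustrative assumptions, not the poster's htmlparser code.

package org.crawler;

import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical sketch of link extraction without htmlparser.
public class LinkExtractor {
    private static final Pattern HREF =
            Pattern.compile("href\\s*=\\s*[\"']([^\"'#]+)[\"']", Pattern.CASE_INSENSITIVE);

    public static Set<String> extractLinks(String url, String html) {
        Set<String> links = new HashSet<String>();
        Matcher m = HREF.matcher(html);
        while (m.find()) {
            try {
                // new URL(base, spec) turns relative hrefs into absolute URLs
                links.add(new URL(new URL(url), m.group(1)).toString());
            } catch (Exception ignored) {
                // skip malformed hrefs
            }
        }
        return links;
    }
}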
package org.crawler;

import java.util.HashSet;

// Thread-safe record of the URLs that have already been fetched.
// synchronized on the instance is sufficient; no extra lock on the
// inner set or wait/notify signaling is needed once the workers block
// on take() instead of polling.
public class VisitedUrl {
    private final HashSet<String> visitedUrl = new HashSet<String>();

    public synchronized void addVisitedUrl(String url) {
        visitedUrl.add(url);
    }

    public synchronized boolean contains(String url) {
        return visitedUrl.contains(url);
    }
}
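As an aside, the JDK can provide the same thread-safe add/contains behavior directly; a hypothetical drop-in alternative (not from the original post):

package org.crawler;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

// Hypothetical alternative to VisitedUrl: a synchronized Set gives the
// same thread-safe add/contains semantics with less code.
public class VisitedUrlAlt {
    private final Set<String> visited =
            Collections.synchronizedSet(new HashSet<String>());

    public void addVisitedUrl(String url) {
        visited.add(url);
    }

    public boolean contains(String url) {
        return visited.contains(url);
    }
}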
package org.crawler;

// Simple holder for a fetched page: its URL and raw HTML content.
public class WebPage {
    private String url;
    private String content;

    public synchronized String getUrl() {
        return url;
    }

    public synchronized void setUrl(String url) {
        this.url = url;
    }

    public synchronized String getContent() {
        return content;
    }

    public synchronized void setContent(String content) {
        this.content = content;
    }
}
package org.crawler;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

public class Test {
    public static void main(String[] args) {
        LinkedBlockingQueue<WebPage> webPageDB = new LinkedBlockingQueue<WebPage>();
        VisitedUrl visitedUrl = new VisitedUrl();
        LinkedBlockingQueue<String> unvisitedUrl = new LinkedBlockingQueue<String>();

        // seed the crawl with one start URL
        String url = "http://www.test.com";
        unvisitedUrl.add(url);

        BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
        ThreadPoolExecutor executor = new ThreadPoolExecutor(5, 10, 1, TimeUnit.DAYS, queue);
        executor.execute(new GetPage(webPageDB, visitedUrl, unvisitedUrl));
        executor.execute(new Parse(webPageDB, visitedUrl, unvisitedUrl));

        // shutdown() only stops new submissions; the two workers keep
        // looping until they are interrupted
        executor.shutdown();
    }
}
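As written, main() returns but the two workers run until something interrupts them, so the JVM never exits on its own. One hypothetical way to bound the crawl, replacing the plain executor.shutdown() call at the end of main (the one-minute duration is arbitrary, and main would then need to declare "throws InterruptedException"):

        // crawl for a fixed time, then interrupt the workers so their
        // run() loops can observe the interrupt and exit
        Thread.sleep(60000);
        executor.shutdownNow();                           // interrupts GetPage and Parse
        executor.awaitTermination(10, TimeUnit.SECONDS);  // give the workers time to finish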