手写的爬虫程序,程序可以成功运行,只是效率太低,十几秒才爬一条数据,求大神指点提高效率,谢谢!!

struts_hibernate_sp 2013-12-04 10:14:30
import .....
/**
* 获取****的数据
*/
public class DoMain3 {
/**
* 根据网页url获取页面内容
*/
/**
 * Downloads the page at {@code url} and returns its HTML as one string.
 *
 * <p>Lines are joined with {@code '\n'}. On any failure the error is printed
 * and an empty string {@code ""} is returned (same contract as before), so
 * callers never receive {@code null}.
 *
 * @param url absolute HTTP URL of the page to fetch
 * @return the page body decoded as UTF-8, or {@code ""} if the fetch failed
 */
public String getHtmlString(String url){
    String hs = "";
    try {
        URL u = new URL(url);
        HttpURLConnection conn = (HttpURLConnection) u.openConnection();
        conn.setRequestProperty("User-Agent", "MSIE 7.0");
        // Bound both phases of the request. The original had no timeouts at
        // all, so one slow server response stalled the whole crawl -- this is
        // the main cause of the "one record every ~15s" symptom.
        conn.setConnectTimeout(10000);
        conn.setReadTimeout(10000);
        // StringBuilder: single-threaded use, no need for StringBuffer's locks.
        StringBuilder html = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream)
        // even when readLine() throws; the original leaked both.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "utf-8"))) {
            String line;
            while ((line = br.readLine()) != null) {
                // Two appends instead of 'line + "\n"' -- avoids building a
                // throwaway concatenated String per line.
                html.append(line).append('\n');
            }
        } finally {
            // Release the connection so the socket can be reused/freed.
            conn.disconnect();
        }
        hs = html.toString();
        System.out.println(url);
    } catch (Exception e) {
        System.out.println("URL地址加载出错!!");
        e.printStackTrace();
    }
    return hs;
}
public static void main(String rags[]){
Dao d = new Dao();
DoMain3 dm = new DoMain3();
String title="";
String section="";
String content="";
String contentTitle="";
int count=110;

String url="http://*************************" ;
if(d.createTable()){
System.out.println("建表成功!!!");
try {
//加载标题页面
Document doc = Jsoup.parse(dm.getHtmlString(url));
Element titles = doc.getElementById("maincontent");
Elements lis=titles.getElementsByTag("li");
//*********************标题****************************
for(int i=1;i<lis.size();i++){
Elements a = lis.get(i).getElementsByTag("a");
if(a.toString().equals("")){
title=lis.get(i).text();
contentTitle=title;
String data[]={contentTitle,title,section,content,url};
if(d.pinsertData(data)){
System.out.println("第"+(i+1)+"题数据插入成功!!!");
System.out.println("*****************"+count+"*****************");
}else{
System.out.println("第"+(i+1)+"题节数据插入失败!!!");
System.out.println("*****************"+count+"*****************");
break;
}
count++;
continue;
}else{
title=a.get(0).text();
url="http://****************"+a.get(0).attr("href");
//加载章节页面
Document doc2=Jsoup.parse(dm.getHtmlString(url));
Element sections =doc2.getElementById("maincontent");
Elements ls = sections.getElementsByTag("li");
//**********************节************************
for(int j=0;j<ls.size();j++){
Elements link = ls.get(j).getElementsByTag("a");
if(link.toString().equals("")){
section=ls.get(j).text();
contentTitle=title+" "+section;
}else{
section = link.get(0).text();
url="http:*******************"+link.get(0).attr("href");
//加载内容页面
Document doc3=Jsoup.parse(dm.getHtmlString(url));
Element contents=doc3.getElementById("maincontent");
content=contents.text();
//处理内容字符串
content=content.substring(content.indexOf("?")+"?".length());
content=content.replace("'", "''");
contentTitle=title+" "+section;
}
System.out.println("****************"+count+"******************");
System.out.println("正在读第"+(i+1)+"题"+(j+1)+"节");


//往数据库插入数据
String data[]={contentTitle,title,section,content,url};
if(d.pinsertData(data)){
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入成功!!!");
System.out.println("*****************"+count+"*****************");
count++;
}else{
System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入失败!!!");
System.out.println("*****************"+count+"*****************");
break;
}
}//end for
}

System.out.println("第"+(i+1)+"题采集完毕");


}//end for

System.out.println("采集完毕!!");

} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}

...全文
290 8 打赏 收藏 转发到动态 举报
写回复
用AI写文章
8 条回复
切换为时间正序
请发表友善的回复…
发表回复
冰涛 2015-10-09
  • 打赏
  • 举报
回复
你可以参考这个看看
冰涛 2015-10-09
  • 打赏
  • 举报
回复
https://www.baidu.com/s?wd=jsoup%20%E5%A4%AA%E6%85%A2&rsv_spt=1&rsv_iqid=0xa4c58e5b0001928e&issp=1&f=3&rsv_bp=1&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&oq=java%2520write%2520%255Ct%2520%25E6%25B2%25A1%25E7%2594%25A8&inputT=11038&rsv_t=64ecKnkhyG%2Bspt6MnEr2Ttfue0gE4iYduVY65jj1n6jePnM1gL%2FwO3GVvk4XcSPt8z5R&rsv_pq=e14f800e00019f3d&sug=jsoup%E8%A7%A3%E6%9E%90html&rsv_sug3=27&rsv_sug1=19&rsv_n=1&rsv_sug2=0&prefixsug=jsoup%2520%25E5%25A4%25AA%25E6%2585%25A2&rsp=0&rsv_sug4=12115
  • 打赏
  • 举报
回复
** * 获取**************的数据 * @author wf * */ public class DoMain5 { public Document getDoc(String url){ Document doc=null; try { doc=Jsoup.connect(url).get(); } catch (Exception e) { System.out.println("文档解析失败!!"); e.printStackTrace(); } return doc; } public static void main(String rags[]){ Dao d = new Dao(); DoMain5 dm = new DoMain5(); String title=""; String section=""; String content=""; String contentTitle=""; int count=630; String url="******************" ; if(d.createTable()){ System.out.println("建表成功!!!"); try { Document doc = dm.getDoc(url); System.out.println(doc); Element titles = doc.getElementById("maincontent"); Elements lis=titles.getElementsByTag("li"); //*********************标题**************************** for(int i=1;i<lis.size();i++){ Elements a = lis.get(i).getElementsByTag("a"); if(a.toString().equals("")){ title=lis.get(i).text(); contentTitle=title; String data[]={contentTitle,title,section,content,url}; if(d.pinsertData(data)){ System.out.println("第"+(i+1)+"题数据插入成功!!!"); System.out.println("*****************"+count+"*****************"); }else{ System.out.println("第"+(i+1)+"题节数据插入失败!!!"); System.out.println("*****************"+count+"*****************"); break; } count++; continue; }else{ title=a.get(0).text(); url="http:***************"+a.get(0).attr("href"); Document doc2=dm.getDoc(url); Element sections =doc2.getElementById("maincontent"); Elements ls = sections.getElementsByTag("li"); //**********************节************************ for(int j=507;j<ls.size();j++){ Elements link = ls.get(j).getElementsByTag("a"); if(link.toString().equals("")){ section=ls.get(j).text(); contentTitle=title+" "+section; }else{ section = link.get(0).text(); url="http:****************"+link.get(0).attr("href"); Document doc3=dm.getDoc(url); Element contents=doc3.getElementById("maincontent"); content=contents.text(); //处理内容字符串 content=content.substring(content.indexOf("?")+"?".length()); content=content.replace("'", "''"); contentTitle=title+" "+section; } 
System.out.println("****************"+count+"******************"); System.out.println("正在读第"+(i+1)+"题"+(j+1)+"节"); String data[]={contentTitle,title,section,content,url}; if(d.pinsertData(data)){ System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入成功!!!"); System.out.println("*****************"+count+"*****************"); count++; }else{ System.out.println("第"+(i+1)+"题"+(j+1)+"节数据插入失败!!!"); System.out.println("*****************"+count+"*****************"); break; } }//end for } System.out.println("第"+(i+1)+"题采集完毕"); break; }//end for System.out.println("采集完毕!!"); } catch (Exception e) { e.printStackTrace(); } 经过各位大声指点修改后 这个程序效率有明显提高,不过现在运行起来随时随地会抛出下面两个异常,还请各位大虾指点怎么解决: 1.java.net.SocketTimeoutException: Read timed out at java.net.SocketInputStream.socketRead0(Native Method) at java.net.SocketInputStream.read(SocketInputStream.java:129) at java.io.BufferedInputStream.fill(BufferedInputStream.java:218) at java.io.BufferedInputStream.read1(BufferedInputStream.java:258) at java.io.BufferedInputStream.read(BufferedInputStream.java:317) at sun.net.www.http.HttpClient.parseHTTPHeader(HttpClient.java:687) at sun.net.www.http.HttpClient.parseHTTP(HttpClient.java:632) at sun.net.www.protocol.http.HttpURLConnection.getInputStream (HttpURLConnection.java:1064) at java.net.HttpURLConnection.getResponseCode(HttpURLConnection.java:373) at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:429) at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:410) at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:164) at org.jsoup.helper.HttpConnection.get(HttpConnection.java:153) at com.wanfang.dousact.DoMain5.getDoc(DoMain5.java:35) at com.wanfang.dousact.DoMain5.main(DoMain5.java:61) 2.java.net.SocketTimeoutException: connect timed out at java.net.PlainSocketImpl.socketConnect(Native Method) at java.net.PlainSocketImpl.doConnect(PlainSocketImpl.java:333) at java.net.PlainSocketImpl.connectToAddress(PlainSocketImpl.java:195) at 
java.net.PlainSocketImpl.connect(PlainSocketImpl.java:182) at java.net.SocksSocketImpl.connect(SocksSocketImpl.java:366) at java.net.Socket.connect(Socket.java:519) at sun.net.NetworkClient.doConnect(NetworkClient.java:158) at sun.net.www.http.HttpClient.openServer(HttpClient.java:394) at sun.net.www.http.HttpClient.openServer(HttpClient.java:529) at sun.net.www.http.HttpClient.<init>(HttpClient.java:233) at sun.net.www.http.HttpClient.New(HttpClient.java:306) at sun.net.www.http.HttpClient.New(HttpClient.java:323) at sun.net.www.protocol.http.HttpURLConnection.getNewHttpClient (HttpURLConnection.java:852) at sun.net.www.protocol.http.HttpURLConnection.plainConnect (HttpURLConnection.java:793) at sun.net.www.protocol.http.HttpURLConnection.connect(HttpURLConnection.java:718) at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:425) at org.jsoup.helper.HttpConnection$Response.execute(HttpConnection.java:410) at org.jsoup.helper.HttpConnection.execute(HttpConnection.java:164) at org.jsoup.helper.HttpConnection.get(HttpConnection.java:153) at com.wanfang.dousact.DoMain5.getDoc(DoMain5.java:35) at com.wanfang.dousact.DoMain5.main(DoMain5.java:87)
萧萧可乐 2013-12-05
  • 打赏
  • 举报
回复
多线程+提高带宽
  • 打赏
  • 举报
回复
一开始就是用的jsoup 效率比这个还低,就在 Document doc = Jsoup.parse(method.getResponseBodyAsString()); 这一步就走不动了,很头疼,有人建议我用sax解析,但是sax能用来解析html吗?
KK3K2005 2013-12-04
  • 打赏
  • 举报
回复
开多个线程跑
Deep_Learning 2013-12-04
  • 打赏
  • 举报
回复
用jsoup吧,很简单,也很好爬
  • 打赏
  • 举报
回复
主要是这两句,debug的时候老是在这两句停好长时间 1.BufferedReader br = new BufferedReader(new InputStreamReader(conn.getInputStream(),"utf-8")) 2.while((line=br.readLine())!=null){ HtmlString.append(line+"\n"); }

61,112

社区成员

发帖
与我相关
我的任务
社区描述
层叠样式表(英文全称:Cascading Style Sheets)是一种用来表现HTML(标准通用标记语言的一个应用)或XML(标准通用标记语言的一个子集)等文件样式的计算机语言。
社区管理员
  • HTML(CSS)社区
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告
暂无公告

试试用AI创作助手写篇文章吧