51,396
社区成员




import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import util.ICrawler;
import util.InfoNode;
/**
 * Crawler for public Sina Weibo messages.
 *
 * <p>Fetches the page at {@link #BASE_URL}, scans it line by line for item
 * blocks and extracts a uid, a user name and a status text from each one.
 *
 * @author mgd
 */
public class SinaCrawler implements ICrawler {
    /** Base address of Sina Weibo. */
    private final static String BASE_URL = "http://weibo.com/";
    /** A line ending with this marker opens one message item. */
    private final static String ITEM_MARKER = "<div class=\"itemt\">";
    /** A line ending with this marker ends the scan (secondary stop condition). */
    private final static String END_MARKER = "<div class=\"clearit\"></div>";
    /** Source label stored in every produced node. */
    private final static String SOURCE_NAME = "新浪微博";
    /** Maximum number of items collected per call (primary stop condition). */
    private final static int MAX_NODES = 20;
    /** Offset from the start of "uid=" to the first uid character. */
    private final static int UID_OFFSET = 5;
    /** Number of characters taken as the uid. */
    private final static int UID_LENGTH = 10;
    /**
     * Offset from the start of "&lt;/a&gt;" to the first character of the status
     * text: the 4-character tag plus one following separator character.
     */
    private final static int TEXT_OFFSET = 5;

    private HttpURLConnection conn;
    private ArrayList<InfoNode> nodeList;

    /** Creates a crawler with an empty result list. */
    public SinaCrawler() {
        this.nodeList = new ArrayList<InfoNode>();
    }

    /**
     * Downloads the page at {@link #BASE_URL} and extracts up to
     * {@value #MAX_NODES} items from it.
     *
     * <p>Each call starts from a fresh list, so results of earlier calls are
     * not accumulated. Scanning stops at the first of: the end marker line, the
     * item limit, or the end of the stream. Item lines that do not have the
     * expected layout are skipped.
     *
     * @return the items found by this call, in page order; never {@code null}
     * @throws IOException if the connection cannot be opened or read
     */
    @Override
    public ArrayList<InfoNode> getInfoList() throws IOException {
        // Start over on every call; otherwise a second call would append to the
        // previous results and the item limit would never be hit again.
        this.nodeList = new ArrayList<InfoNode>();
        this.conn = (HttpURLConnection) new URL(BASE_URL).openConnection();
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(
                    this.conn.getInputStream(), "utf-8"));
            try {
                String line;
                // readLine() returns null at end of stream; stop instead of dereferencing it.
                while ((line = reader.readLine()) != null) {
                    if (line.endsWith(END_MARKER)) {
                        break; // secondary stop condition
                    }
                    if (!line.endsWith(ITEM_MARKER)) {
                        continue;
                    }
                    // The item data sits on the third line after the marker.
                    reader.readLine();
                    reader.readLine();
                    line = reader.readLine();
                    if (line == null) {
                        break; // stream ended in the middle of an item
                    }
                    InfoNode node = parseItem(line);
                    if (node == null) {
                        continue; // unexpected layout, skip this item
                    }
                    this.nodeList.add(node);
                    if (this.nodeList.size() >= MAX_NODES) {
                        break; // primary stop condition
                    }
                }
            } finally {
                reader.close();
            }
        } finally {
            // Release the connection on the error path as well.
            this.conn.disconnect();
        }
        return this.nodeList;
    }

    /**
     * Extracts uid, user name and status text from one item line.
     *
     * @param line the item data line
     * @return the parsed node, or {@code null} if any expected delimiter is missing
     */
    private static InfoNode parseItem(String line) {
        int uidKey = line.indexOf("uid=");
        if (uidKey < 0) {
            return null;
        }
        int uidStart = uidKey + UID_OFFSET;
        int uidEnd = uidStart + UID_LENGTH;
        if (uidEnd > line.length()) {
            return null;
        }
        String uid = line.substring(uidStart, uidEnd);

        // The user name is the text between the next '>' and the following '<'.
        int tagClose = line.indexOf('>', uidEnd);
        if (tagClose < 0) {
            return null;
        }
        int nameStart = tagClose + 1;
        int nameEnd = line.indexOf('<', nameStart);
        if (nameEnd < 0) {
            return null;
        }
        String uname = line.substring(nameStart, nameEnd);

        // The status text runs from just past "</a>" and its separator up to the next '<'.
        int anchorClose = line.indexOf("</a>", nameEnd);
        if (anchorClose < 0) {
            return null;
        }
        int textStart = anchorClose + TEXT_OFFSET;
        int textEnd = line.indexOf('<', textStart);
        if (textEnd < 0) {
            return null;
        }
        String text = line.substring(textStart, textEnd);

        return new InfoNode(SOURCE_NAME, uid, uname, text);
    }

    /** Manual test entry point: crawls once and prints every item. */
    public static void main(String[] args) {
        ICrawler crawler = new SinaCrawler();
        try {
            for (InfoNode iter : crawler.getInfoList()) {
                System.out.println(iter);
                System.out.println(); // blank separator line, for debugging
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}