网络爬虫---如何取得网页中的文本信息并保存到txt中

jyf823 2011-04-11 02:27:37

想抓取某网页，然后取得网页中的特定信息，并保存到txt中，求高手指点！

...全文

1733 8 打赏收藏转发到动态举报

写回复

用AI写文章

8 条回复

切换为时间正序

请发表友善的回复…

发表回复

星海。 2011-07-14

打赏
举报

用正则表达式就行了呃！

在后台抓取网页中需要的内容，然后把内容存入txt文件中就OK了
至于具体的实现就要自己研究了

林R 2011-07-13

打赏
举报

xue xi le `````````

turing-complete 2011-07-12

打赏
举报

阁下，玩儿过新浪微博没有？
给个例子你看看，怎么提取首页滚动消息的

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.net.HttpURLConnection;

import java.net.URL;

import java.util.ArrayList;



import util.ICrawler;

import util.InfoNode;



/**
 * Crawler for the public messages shown on the Sina Weibo home page.
 *
 * <p>The page is scanned line by line; each entry that follows an
 * {@code <div class="itemt">} line is parsed by fixed character offsets into
 * an {@link InfoNode}. The parsing is tied to the exact HTML layout the page
 * had when this was written.
 *
 * @author mgd
 */
public class SinaCrawler implements ICrawler {

	// Base URL of Sina Weibo.
	private final static String BASE_URL="http://weibo.com/";

	// A line ending with this marker is followed (three lines later) by an entry.
	private final static String ITEM_MARKER = "<div class=\"itemt\">";

	// A line ending with this marker ends the scan (secondary stop condition).
	private final static String STOP_MARKER = "<div class=\"clearit\"></div>";

	// Maximum number of entries collected by a single call (primary stop condition).
	private final static int MAX_ITEMS = 20;

	// Length of the uid value; the original code assumes it is always 10 characters.
	private final static int UID_LENGTH = 10;

	private HttpURLConnection conn;

	private ArrayList<InfoNode> nodeList;

	/** Creates a crawler with an empty result list. */
	public SinaCrawler() {
		this.nodeList=new ArrayList<InfoNode>();
	}

	/**
	 * Downloads the page at {@link #BASE_URL} and appends up to
	 * {@value #MAX_ITEMS} parsed entries to this crawler's result list.
	 *
	 * <p>Scanning stops at the first of: {@value #MAX_ITEMS} entries added by
	 * this call, a line ending with the stop marker, or end of stream. Entry
	 * lines that do not match the expected layout are skipped.
	 *
	 * <p>The same list instance is returned on every call and entries from
	 * earlier calls are kept in it.
	 *
	 * @return the crawler's result list, including the entries added by this call
	 * @throws IOException if the connection cannot be opened or read
	 */
	@Override
	public ArrayList<InfoNode> getInfoList() throws IOException {
		this.conn = (HttpURLConnection) new URL(BASE_URL).openConnection();
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(
					this.conn.getInputStream(), "utf-8"));
			try {
				// Count per call: the list is shared across calls, so comparing its
				// total size against the limit would never match on a second call.
				int added = 0;
				String line;
				// readLine() returns null at end of stream; stop instead of dereferencing it.
				while (added < MAX_ITEMS && (line = reader.readLine()) != null) {
					if (line.endsWith(STOP_MARKER)) {
						break;
					}
					if (!line.endsWith(ITEM_MARKER)) {
						continue;
					}
					// The entry itself is on the third line after the marker.
					reader.readLine();
					reader.readLine();
					line = reader.readLine();
					if (line == null) {
						break;
					}
					InfoNode node = parseItem(line);
					if (node != null) {
						this.nodeList.add(node);
						added++;
					}
				}
			} finally {
				reader.close();
			}
		} finally {
			// Release the connection even when reading or parsing fails.
			this.conn.disconnect();
		}
		return this.nodeList;
	}

	/**
	 * Extracts uid, user name and status text from one entry line.
	 *
	 * @param line the HTML line holding a single entry
	 * @return the parsed node, or {@code null} if the line does not have the
	 *         expected layout
	 */
	private InfoNode parseItem(String line) {
		int uidMarker = line.indexOf("uid=");
		if (uidMarker < 0) {
			return null;
		}
		// "uid=" is 4 characters; the extra 1 skips one more character,
		// presumably an opening quote -- confirm against the page source.
		int uidStart = uidMarker + 5;
		int uidEnd = uidStart + UID_LENGTH;
		if (uidEnd > line.length()) {
			return null;
		}
		String strUid = line.substring(uidStart, uidEnd);

		// The user name is the text between the next '>' and the following '<'.
		int tagClose = line.indexOf('>', uidEnd);
		if (tagClose < 0) {
			return null;
		}
		int nameStart = tagClose + 1;
		int nameEnd = line.indexOf('<', nameStart);
		if (nameEnd < 0) {
			return null;
		}
		String strUname = line.substring(nameStart, nameEnd);

		int anchorClose = line.indexOf("</a>", nameEnd);
		if (anchorClose < 0) {
			return null;
		}
		// "</a>" is 4 characters (the original comment said 5); the offset of 5
		// is kept, so one character after the tag is skipped -- presumably a
		// separator before the status text.
		int textStart = anchorClose + 5;
		int textEnd = line.indexOf('<', textStart);
		if (textEnd < 0) {
			return null;
		}
		String strText = line.substring(textStart, textEnd);

		return new InfoNode("新浪微博", strUid, strUname, strText);
	}

	/**
	 * Manual test entry point: crawls once and prints every entry, each
	 * followed by a blank line.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		ICrawler crawler=new SinaCrawler();
		try {
			for(InfoNode iter : crawler.getInfoList()){
				System.out.println(iter);
				System.out.println();// blank separator line, for debugging
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}