一道面试题

liuzhe_2008 2010-10-07 03:33:09

给定一个URL，下载这个页面和这个页面中的所有链接，怎么做？

...全文

216 13 打赏收藏转发到动态举报

写回复

用AI写文章

13 条回复

切换为时间正序

请发表友善的回复…

发表回复

aijezdm915 2010-10-09

打赏
举报

<[aA][^>]*[hH][rR][eE][fF][^>]*>
???

chenbb110 2010-10-09

打赏
举报

很强大

有一天呵呵 2010-10-07

打赏
举报

呵呵，不错

rjzou2006 2010-10-07

打赏
举报

http://wolfchina.bokee.com/5117147.html

voice1122 2010-10-07

打赏
举报

学习了

龙四 2010-10-07

打赏
举报

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStreamReader;

import java.net.MalformedURLException;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



/**
 * Downloads the page behind a URL and extracts the {@code href} attribute of
 * every {@code <a>} tag found in it.
 *
 * <p>Extraction is regex based, not a real HTML parser: tags inside comments
 * or scripts are still reported, and a literal {@code >} inside a quoted
 * attribute value cuts the tag short.
 */
public class ObtainLinks {

	/**
	 * An opening {@code <a ...>} tag. The whitespace required after the tag
	 * name keeps other tags that merely start with "a" ({@code <area>},
	 * {@code <abbr>}, {@code <address>}, ...) from being treated as anchors.
	 */
	private static final Pattern ANCHOR_TAG = Pattern.compile("<[aA]\\s[^>]*>");

	/**
	 * One attribute inside a tag. Group 1 is the attribute name; group 2 is
	 * the optional value, which is double-quoted, single-quoted or bare.
	 * Quoted values are consumed as a whole, so text that looks like an
	 * attribute inside another attribute's value is never reported.
	 */
	private static final Pattern ATTRIBUTE = Pattern.compile(
			"([^\\s\"'=<>/]+)(?:\\s*=\\s*(\"[^\"]*\"|'[^']*'|[^\\s\"'>]+))?");

	/** Source text of the downloaded page, lines separated by {@code '\n'}. */
	private final String pageSrc;

	/** The URL the page was downloaded from, as given to the constructor. */
	private final String url;

	/**
	 * Downloads the page at {@code url} immediately and keeps its source.
	 *
	 * @param url absolute URL of the page to download
	 * @throws MalformedURLException if {@code url} is not a valid URL
	 * @throws IOException if the page cannot be read
	 */
	public ObtainLinks(String url) throws MalformedURLException, IOException {
		this.url = url;
		this.pageSrc = getPageSrc(url);
	}

	/**
	 * Fetches the source of the page at {@code strUrl}.
	 *
	 * @param strUrl absolute URL of the page
	 * @return the page source, or an empty string if the page is empty
	 * @throws MalformedURLException if {@code strUrl} is not a valid URL
	 * @throws IOException if the page cannot be read
	 */
	private String getPageSrc(String strUrl) throws MalformedURLException, IOException {
		StringBuilder sb = new StringBuilder();
		java.net.URL pageUrl = new java.net.URL(strUrl);
		// NOTE(review): bytes are decoded with the platform default charset, as
		// before. That is enough to find ASCII markup, but non-ASCII text may
		// be garbled when the page uses a different encoding.
		BufferedReader in = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
		try {
			String line;
			while ((line = in.readLine()) != null) {
				// Keep a separator between lines; gluing them together would
				// fuse tokens that were only separated by a line break
				// (e.g. an unquoted href value and the next attribute).
				sb.append(line).append('\n');
			}
		} finally {
			// Close the stream even when reading fails part-way through.
			in.close();
		}
		return sb.toString();
	}

	/**
	 * Collects every opening {@code <a ...>} tag of the page, in document order.
	 *
	 * @return the matched tags; empty if the page contains none
	 */
	private List<String> getAnchorContent() {
		List<String> anchors = new ArrayList<String>();
		Matcher matcher = ANCHOR_TAG.matcher(pageSrc);
		while (matcher.find()) {
			anchors.add(matcher.group());
		}
		return anchors;
	}

	/**
	 * Returns the {@code href} attribute of every {@code <a>} tag on the page.
	 *
	 * <p>Each element is the attribute exactly as written in the page,
	 * including its name and any quotes, e.g. {@code href="http://example.com"},
	 * {@code HREF='x'} or {@code href=x}. Anchors without an {@code href}, or
	 * with an empty one, contribute nothing.
	 *
	 * @return the attributes in document order; never {@code null}
	 */
	public List<String> getUrls() {
		List<String> urls = new ArrayList<String>();
		for (String anchor : getAnchorContent()) {
			String href = findHrefAttribute(anchor);
			if (href != null) {
				urls.add(href);
			}
		}
		return urls;
	}

	/**
	 * Finds the first {@code href} attribute of a single {@code <a ...>} tag.
	 *
	 * @param anchor an opening anchor tag, starting with {@code <a} or {@code <A}
	 * @return the attribute text as written (name, {@code =} and value), or
	 *         {@code null} if the tag has no {@code href} or its value is empty
	 */
	private static String findHrefAttribute(String anchor) {
		Matcher attribute = ATTRIBUTE.matcher(anchor);
		// Start behind "<a" so the tag name itself is not read as an attribute.
		attribute.region(2, anchor.length());
		while (attribute.find()) {
			// Compare the whole name so that e.g. data-href is not mistaken for href.
			if (!"href".equalsIgnoreCase(attribute.group(1))) {
				continue;
			}
			String value = attribute.group(2);
			if (value == null || "\"\"".equals(value) || "''".equals(value)) {
				return null;
			}
			return attribute.group();
		}
		return null;
	}

	/**
	 * Demo: prints the href attributes found on a fixed page (needs network access).
	 */
	public static void main(String[] args) throws Exception {
		List<String> list = new ObtainLinks("http://sports.sina.com.cn/nba/").getUrls();
		for (String str : list) {
			System.out.println(str);
		}
	}
}

thegodofwar 2010-10-07

打赏
举报

[Quote=引用 1 楼 ticmy 的回复:]
Java code
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
……
[/Quote]
类似这种