81,094
社区成员
发帖
与我相关
我的任务
分享
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a page and extracts the {@code href} attributes of its anchor tags.
 *
 * <p>The page is fetched once, in the constructor; {@link #getUrls()} then scans the cached
 * source with regular expressions. This is a lightweight scan, not an HTML parser, so unusual
 * markup (for example a {@code >} inside an attribute value) is not handled.
 */
public class ObtainLinks {
    /**
     * An opening {@code <a ...>} tag whose attribute text contains {@code href}.
     * The whitespace after {@code a} keeps other tags that merely start with "a"
     * (such as {@code <abbr>}) from matching.
     */
    private static final Pattern ANCHOR_TAG =
            Pattern.compile("<a\\s[^>]*href[^>]*>", Pattern.CASE_INSENSITIVE);

    /** {@code href="..."}; group 1 is the (possibly empty) value. */
    private static final Pattern HREF_DOUBLE_QUOTED =
            Pattern.compile("href\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

    /** {@code href='...'}; group 1 is the (possibly empty) value. */
    private static final Pattern HREF_SINGLE_QUOTED =
            Pattern.compile("href\\s*=\\s*'([^']*)'", Pattern.CASE_INSENSITIVE);

    /** {@code href=value}, where the value runs up to the next whitespace or the closing {@code >}. */
    private static final Pattern HREF_UNQUOTED =
            Pattern.compile("href\\s*=\\s*[^\\s>]+", Pattern.CASE_INSENSITIVE);

    private final String pageSrc;
    private final String url;

    /**
     * Fetches the page at {@code url} and keeps its source for later scanning.
     *
     * @param url address of the page to read; anything {@link java.net.URL} can open
     * @throws MalformedURLException if {@code url} cannot be parsed as a URL
     * @throws IOException if the page cannot be opened or read
     */
    public ObtainLinks(String url) throws MalformedURLException, IOException {
        this.url = url;
        pageSrc = getPageSrc(url);
    }

    /**
     * Reads the whole resource at {@code strUrl} as text.
     *
     * @param strUrl address of the page to read
     * @return the page source with every line terminated by {@code '\n'}; an empty string
     *         if the resource has no content
     * @throws MalformedURLException if {@code strUrl} cannot be parsed as a URL
     * @throws IOException if the resource cannot be opened or read
     */
    private String getPageSrc(String strUrl) throws MalformedURLException, IOException {
        StringBuilder src = new StringBuilder();
        java.net.URL pageUrl = new java.net.URL(strUrl);
        // NOTE(review): bytes are decoded with the platform default charset, as before;
        // pages in a different encoding will have non-ASCII text garbled.
        BufferedReader in = new BufferedReader(new InputStreamReader(pageUrl.openStream()));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                // Keep a line break between lines so that tokens on adjacent lines
                // (e.g. an unquoted href value and the next attribute) do not fuse.
                src.append(line).append('\n');
            }
        } finally {
            // Close even when reading fails so the underlying stream is not leaked.
            in.close();
        }
        return src.toString();
    }

    /**
     * Collects every opening {@code <a>} tag in the page that carries an {@code href}.
     *
     * @return the matched tags in document order; empty if there are none
     */
    private List<String> getAnchorContent() {
        List<String> anchors = new ArrayList<String>();
        Matcher matcher = ANCHOR_TAG.matcher(pageSrc);
        while (matcher.find()) {
            anchors.add(matcher.group());
        }
        return anchors;
    }

    /**
     * Adds every non-empty quoted {@code href} attribute found in {@code anchor} to {@code out}.
     *
     * @param quotedHref one of the quoted-href patterns; group 1 must be the attribute value
     * @param anchor     text of a single opening tag
     * @param out        list receiving the full {@code href=...} text of each match
     * @return {@code true} if the tag contains this quoting style at all, even with an empty value
     */
    private static boolean collectQuoted(Pattern quotedHref, String anchor, List<String> out) {
        Matcher matcher = quotedHref.matcher(anchor);
        boolean seen = false;
        while (matcher.find()) {
            seen = true;
            if (matcher.group(1).length() > 0) {
                out.add(matcher.group());
            }
        }
        return seen;
    }

    /**
     * Extracts the {@code href} attributes of all anchor tags in the page.
     *
     * <p>Each element is the attribute exactly as written in the page, including the
     * {@code href=} prefix and any quotes, e.g. {@code href="www.baidu.com"},
     * {@code href='www.baidu.com'} or {@code href=www.baidu.com}. Per tag, double-quoted
     * values take precedence over single-quoted ones, which take precedence over unquoted
     * ones. Empty quoted values are skipped.
     *
     * @return the attributes in document order; empty if the page has no links, never {@code null}
     */
    public List<String> getUrls() {
        List<String> urls = new ArrayList<String>();
        for (String anchor : getAnchorContent()) {
            // <a href="www.baidu.com">
            if (collectQuoted(HREF_DOUBLE_QUOTED, anchor, urls)) {
                continue;
            }
            // <a href='www.baidu.com'>
            if (collectQuoted(HREF_SINGLE_QUOTED, anchor, urls)) {
                continue;
            }
            // <a href=www.baidu.com>
            Matcher matcher = HREF_UNQUOTED.matcher(anchor);
            while (matcher.find()) {
                urls.add(matcher.group());
            }
        }
        return urls;
    }

    /** Demo: prints every href attribute found on a sample page. */
    public static void main(String[] args) throws Exception {
        List<String> list = new ObtainLinks("http://sports.sina.com.cn/nba/").getUrls();
        for (String str : list) {
            System.out.println(str);
        }
    }
}
正则表达式 <[a|A][^>]*[h|H][r|R][e|E][f|F][^>]*> 写得不对:在字符类(方括号)里,“|”不表示“或”,而是一个普通字符,
所以它还能匹配 String str="<| agghd||||ggdhhdj>"; 这样的字符串。
字符类本身就表示“任选其一”,正确的写法应该是 <[aA][^>]*[hH][rR][eE][fF][^>]*>