/**
*
*/
package com.crawl.zongheng;
import java.util.HashMap;
import com.util.RegexUtil;
import com.crawl.CrawlBase;
/**
 * Extracts the details of a single book from its intro page on the Zongheng
 * novel site.
 *
 * <p>The constructor hands the URL to {@code readPageByget}; every getter then
 * runs one regular expression over whatever {@code getPageSourceCode()}
 * returns (both are inherited from {@link CrawlBase}; presumably they fetch
 * and expose the page HTML -- confirm against that class).
 *
 * @author Administrator
 */
public class IntroPage extends CrawlBase {

    /** Returned by {@link #getWordCount()} when the page has no usable word count. */
    private static final int UNKNOWN_WORD_COUNT = -1;

    // Extraction patterns; group 1 is always the value of interest.
    // The meta-tag patterns must not end with a trailing space after "/>",
    // otherwise they only match when the tag happens to be followed by one.
    private static final String AUTHOR = "<meta name=\"og:novel:author\" content=\"(.*?)\"/>";
    private static final String NAME = "<meta name=\"og:novel:book_name\" content=\"(.*?)\"/>";
    private static final String DESC = "<meta property=\"og:description\" content=\"(.*?)\"/>";
    private static final String TYPE = "<meta name=\"og:novel:category\" content=\"(.*?)\"/>";
    private static final String LASTCHAPTER = "<a class=\"chap\" href=\".*?\">(.*?)<p>";
    private static final String WORDCOUNT = "<span title=\"(\\d*?)字\">";
    private static final String KEYWORDS = "<div class=\"keyword\">(.*?)</div>";
    private static final String KEYWORD = "<a.*?>(.*?)</a>";
    private static final String CHAPTERLISTURL = "<meta name=\"og:novel:read_url\" content=\"(.*?)\"/>";

    /**
     * Request settings shared by every instance.
     * NOTE(review): assumed to be sent by CrawlBase as HTTP request headers -- confirm.
     */
    private static final HashMap<String, String> params = new HashMap<String, String>();

    static {
        params.put("Referer", "http://book.zongheng.com");
        params.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4033.400 QQBrowser/9.6.12624.400");
        // The Host value has to name the server that is actually requested.
        // Intro pages live on book.zongheng.com (see the Referer above), so
        // announcing www.zongheng.com here can make the server answer with a
        // different or unusable page.
        params.put("Host", "book.zongheng.com");
    }

    /** URL of the intro page this instance was created for. */
    private final String url;

    /**
     * Loads the intro page at the given URL, requesting it as UTF-8.
     *
     * @param url address of the book's intro page
     */
    public IntroPage(String url) {
        this.url = url;
        readPageByget(url, params, "utf-8");
    }

    /** @return the book title taken from the {@code og:novel:book_name} meta tag */
    private String getName() {
        return RegexUtil.getFirstString(getPageSourceCode(), NAME, 1);
    }

    /** @return the author taken from the {@code og:novel:author} meta tag */
    private String getAuthor() {
        return RegexUtil.getFirstString(getPageSourceCode(), AUTHOR, 1);
    }

    /** @return the description taken from the {@code og:description} meta tag */
    private String getDesc() {
        return RegexUtil.getFirstString(getPageSourceCode(), DESC, 1);
    }

    /** @return the category taken from the {@code og:novel:category} meta tag */
    private String getType() {
        return RegexUtil.getFirstString(getPageSourceCode(), TYPE, 1);
    }

    /** @return the text of the first {@code <a class="chap">} link, up to the following {@code <p>} */
    private String getLastChapter() {
        return RegexUtil.getFirstString(getPageSourceCode(), LASTCHAPTER, 1);
    }

    /**
     * Reads the word count from the first {@code <span title="...字">} element.
     *
     * @return the parsed count, or {@link #UNKNOWN_WORD_COUNT} when the element
     *         is missing, empty, or not a valid {@code int}
     */
    private int getWordCount() {
        String wordCount = RegexUtil.getFirstString(getPageSourceCode(), WORDCOUNT, 1);
        if (wordCount == null) {
            return UNKNOWN_WORD_COUNT;
        }
        try {
            return Integer.parseInt(wordCount.trim());
        } catch (NumberFormatException e) {
            // Empty capture (the pattern allows zero digits) or a value too large
            // for an int: report "unknown" instead of aborting the whole extraction.
            return UNKNOWN_WORD_COUNT;
        }
    }

    /** @return the raw inner HTML of the first {@code <div class="keyword">} element */
    private String getKeyWordStr() {
        return RegexUtil.getFirstString(getPageSourceCode(), KEYWORDS, 1);
    }

    /**
     * Collects the link texts inside the keyword block.
     *
     * @return the keywords as produced by {@code RegexUtil.getString} with a
     *         single-space separator argument, or an empty string when the page
     *         has no keyword block
     */
    private String getKeyWord() {
        String keywordBlock = getKeyWordStr();
        if (keywordBlock == null || keywordBlock.isEmpty()) {
            // Nothing to scan; avoid handing a missing block to the regex helper.
            return "";
        }
        return RegexUtil.getString(keywordBlock, KEYWORD, " ", 1);
    }

    /** @return the chapter-list URL taken from the {@code og:novel:read_url} meta tag */
    private String getChapterListUrl() {
        return RegexUtil.getFirstString(getPageSourceCode(), CHAPTERLISTURL, 1);
    }

    /**
     * Manual smoke test: loads one sample book page and prints every extracted field.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        IntroPage introPage = new IntroPage("http://book.zongheng.com/book/712549.html");
        System.out.println(introPage.getName());
        System.out.println(introPage.getAuthor());
        System.out.println(introPage.getDesc());
        System.out.println(introPage.getType());
        System.out.println(introPage.getLastChapter());
        System.out.println(introPage.getWordCount());
        System.out.println(introPage.getKeyWord());
        System.out.println(introPage.getChapterListUrl());
    }
}
输出结果以及报错信息如下:
首先,不知道为什么会出现这些乱码字符;其次,应该输出 8 个结果,可是我这里似乎只显示了 5 个,并且还出现了其他异常。谢谢各位大神了。