50,542
社区成员
发帖
与我相关
我的任务
分享
// Directory that will hold the generated Lucene index.
File indexDir = new File("D:\\luceneIndex");
// Directory whose .txt files are indexed so their contents can be searched.
File dataDir = new File("D:\\luceneData");
// Choose the analyzer (tokenizer). Lucene's built-in SimpleAnalyzer shown for reference:
// Analyzer writerAnalyzer = new SimpleAnalyzer(Version.LUCENE_33);
// Paoding ("庖丁解牛") analyzer for Chinese word segmentation.
Analyzer writerAnalyzer = new PaodingAnalyzer();
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_33, writerAnalyzer);
// CREATE rebuilds the index from scratch (use CREATE_OR_APPEND for incremental updates).
indexWriterConfig.setOpenMode(OpenMode.CREATE);
// Writer that streams the index to disk.
IndexWriter indexWriter = new IndexWriter(FSDirectory.open(indexDir), indexWriterConfig);
try {
    File[] dataFiles = dataDir.listFiles();
    // Bug fix: listFiles() returns null when dataDir is missing or not a
    // directory; the original would NPE on dataFiles.length.
    if (dataFiles == null) {
        dataFiles = new File[0];
    }
    long startTime = new Date().getTime();
    for (int i = 0; i < dataFiles.length; i++) {
        if (dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")) {
            System.out.println("Indexing file " + dataFiles[i].getCanonicalPath());
            Document document = new Document();
            FileInputStream fileInputStream = new FileInputStream(dataFiles[i]);
            InputStreamReader reader = new InputStreamReader(fileInputStream, "GBK");
            try {
                document.add(new Field("path", dataFiles[i].getCanonicalPath(), Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("filename", dataFiles[i].getName(), Field.Store.YES, Field.Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
                // Reader-backed field: the reader is consumed (tokenized)
                // while addDocument runs, so it must stay open until then.
                document.add(new Field("contents", reader, TermVector.WITH_POSITIONS_OFFSETS));
                indexWriter.addDocument(document);
            } finally {
                // Bug fix: the original leaked one file handle per indexed file.
                reader.close();
            }
        }
    }
    // Merge index segments so later searches are faster.
    indexWriter.optimize();
} finally {
    // Bug fix: always release the index write lock, even if indexing throws;
    // the original left the lock held on any exception in the loop.
    indexWriter.close();
}
// Location of the Lucene index on disk (must match the indexing side).
public static String IndexPath = "D:\\luceneIndex";
String queryStr = "中国中央电视台";
// Read the index.
File indexDir = new File(IndexPath);
// Bug fix: check for the index BEFORE opening it. The original opened
// FSDirectory/IndexSearcher first, which throws when the index is missing,
// so its existence check could never be reached.
if (!indexDir.exists()) {
    System.out.println("The Lucene index is not exist");
    return;
}
FSDirectory directory = FSDirectory.open(indexDir);
IndexSearcher searcher = new IndexSearcher(directory);
// Bug fix: open the reader ONCE, not once per hit inside the loop,
// and close searcher/reader/directory when done (the original leaked all
// of them, and opened a fresh IndexReader for every result).
IndexReader indexReader = IndexReader.open(directory);
try {
    // Query parser over the "contents" field, using the same Paoding
    // Chinese analyzer that was used at index time.
    QueryParser queryParser = new QueryParser(Version.LUCENE_33,
            "contents", new PaodingAnalyzer());
    // Parse the free-text query string.
    Query query = queryParser.parse(queryStr);
    // Alternative: exact single-term lookup.
    // Term term = new Term("contents", queryStr.toLowerCase());
    // TermQuery query = new TermQuery(term);
    TopDocs topDocs = searcher.search(query, 10);
    ScoreDoc[] scoreDocs = topDocs.scoreDocs;
    for (int i = 0; i < scoreDocs.length; i++) {
        Document document = searcher.doc(scoreDocs[i].doc);
        System.out.println("Name: " + document.get("filename"));
        System.out.println("FilePath: " + document.get("path"));
        // Highlight the matched terms inside the original file text.
        String text = ContentReader.readText(document.get("path"));
        // Term vectors (stored WITH_POSITIONS_OFFSETS at index time) let us
        // rebuild a token stream without re-analyzing the document.
        TermPositionVector tpv = (TermPositionVector) indexReader.getTermFreqVector(
                scoreDocs[i].doc, "contents");
        TokenStream ts = TokenSources.getTokenStream(tpv);
        Formatter formatter = new Formatter() {
            @Override
            public String highlightTerm(String srcText, TokenGroup g) {
                // Only wrap fragments that actually matched the query.
                if (g.getTotalScore() <= 0) {
                    return srcText;
                }
                return "<b>" + srcText + "</b>";
            }
        };
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(
                query));
        // Up to 5 best fragments, joined with an ellipsis separator.
        String result = highlighter.getBestFragments(ts, text, 5, "…");
        System.out.println("result:\n\t" + result);
    }
} finally {
    indexReader.close();
    searcher.close();
    directory.close();
}
}