一个关于IKAnalyzer的问题
/**
 * Demo: indexes two small documents into an in-memory Lucene index using
 * IKAnalyzer, then runs a keyword query built by IKQueryParser and prints
 * the hit count followed by each returned document.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args){
    // Name of the Lucene Document field that holds the searchable text
    String fieldName = "text";
    // Contents to be indexed
    String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
    String text2 = "中文英文工具包";
    // Instantiate the IKAnalyzer tokenizer
    Analyzer analyzer = new IKAnalyzer();
    Directory directory = null;
    IndexWriter iwriter = null;
    IndexSearcher isearcher = null;
    try {
        // Build the in-memory index
        directory = new RAMDirectory();
        iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);

        Document doc = new Document();
        doc.add(new Field("ID", "10000", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));

        Document doc2 = new Document();
        doc2.add(new Field("ID", "10001", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc2.add(new Field(fieldName, text2, Field.Store.YES, Field.Index.ANALYZED));

        iwriter.addDocument(doc);
        iwriter.addDocument(doc2);
        iwriter.close();
        // Already closed on the success path; clear the reference so the
        // finally block does not close it a second time.
        iwriter = null;

        // Instantiate the searcher
        isearcher = new IndexSearcher(directory);
        // Use the IKSimilarity scorer in the searcher
        isearcher.setSimilarity(new IKSimilarity());

        String keyword = "中文分词工具";
        // Build the Query object with the IKQueryParser query analyzer
        Query query = IKQueryParser.parse(fieldName, keyword);

        // Fetch the 5 highest-scoring records
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("命中:" + topDocs.totalHits);

        // Print the results. Iterate over scoreDocs rather than counting up
        // to totalHits: the search above asks for at most 5 results, so
        // totalHits may exceed the length of scoreDocs.
        for (ScoreDoc hit : topDocs.scoreDocs) {
            Document targetDoc = isearcher.doc(hit.doc);
            System.out.println("内容:" + targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (isearcher != null) {
            try {
                isearcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // Only non-null here when an exception interrupted indexing before
        // the normal close above.
        if (iwriter != null) {
            try {
                iwriter.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
在这种情况下,text2 的“中文英文工具包”不能命中。但是我测试过,使用 IK 可以把其中的“工具”拆分出来;查询的关键词 keyword = "中文分词工具" 也可以把“工具”分出来。那么为什么查询不到 text2 呢?
输出结果如下:
命中:1
内容:Document<stored,indexed<ID:10000> stored,indexed,tokenized<text:IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。>>